mirror_ubuntu-hirsute-kernel.git: drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
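/*
 * Worked example: for gaw == 48, __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1,
 * which fits in an unsigned long on 64-bit builds, so DOMAIN_MAX_PFN(48)
 * is 2^36 - 1; on a 32-bit build it would be clamped to ULONG_MAX.
 */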
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
81
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
92 * that we support.
93 *
94 * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of 4KiB and that the
96 * mapping has natural alignment.
97 *
98 * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100 *
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
103 */
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
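/*
 * Note: ~0xFFFUL leaves every bit from bit 12 upwards set, i.e. it
 * advertises 4KiB, 8KiB, 16KiB, ... up to the largest power-of-two
 * size representable in an unsigned long.
 */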
105
106 static inline int agaw_to_level(int agaw)
107 {
108 return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
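/*
 * For reference (derived from the helpers above): agaw 1 maps to a
 * 39-bit address width and a 3-level page table, agaw 2 to 48 bits
 * and 4 levels, agaw 3 to 57 bits and 5 levels. width_to_agaw() is
 * the inverse, e.g. width_to_agaw(48) == 2.
 */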
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133 return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138 return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143 return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164 return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168 return page_to_dma_pfn(virt_to_page(p));
169 }
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178 * set to 1 to panic kernel if can't successfully enable VT-d
179 * (used when kernel is launched w/ TXT)
180 */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
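/*
 * struct root_entry is two u64s (lo/hi, 16 bytes), so a 4KiB root table
 * holds 256 entries, one per PCI bus number.
 */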
186
187 /*
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
190 */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 if (!(re->lo & 1))
194 return 0;
195
196 return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
202 */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 if (!(re->hi & 1))
206 return 0;
207
208 return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228 return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233 return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245 context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
255 {
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
262 {
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
269 {
270 context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
275 {
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 context->lo = 0;
287 context->hi = 0;
288 }
289
290 /*
291  * This domain is a static identity mapping domain.
292  * 1. This domain creates a static 1:1 mapping to all usable memory.
293  * 2. It maps to each iommu if successful.
294  * 3. Each iommu maps to this domain if successful.
295 */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
301
302 /*
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
307 */
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
309
310 /*
311 * When VT-d works in the scalable mode, it allows DMA translation to
312  * happen through either a first-level or a second-level page table. This
313  * bit marks that the DMA translation for the domain goes through the
314  * first-level page table; otherwise, it goes through the second level.
315 */
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2)
317
318 /*
319  * The domain represents a virtual machine which requires IOMMU nested
320 * translation mode support.
321 */
322 #define DOMAIN_FLAG_NESTING_MODE BIT(3)
323
324 #define for_each_domain_iommu(idx, domain) \
325 for (idx = 0; idx < g_num_of_iommus; idx++) \
326 if (domain->iommu_refcnt[idx])
327
328 struct dmar_rmrr_unit {
329 struct list_head list; /* list of rmrr units */
330 struct acpi_dmar_header *hdr; /* ACPI header */
331 u64 base_address; /* reserved base address*/
332 u64 end_address; /* reserved end address */
333 struct dmar_dev_scope *devices; /* target devices */
334 int devices_cnt; /* target device count */
335 };
336
337 struct dmar_atsr_unit {
338 struct list_head list; /* list of ATSR units */
339 struct acpi_dmar_header *hdr; /* ACPI header */
340 struct dmar_dev_scope *devices; /* target devices */
341 int devices_cnt; /* target device count */
342 u8 include_all:1; /* include all ports */
343 };
344
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
347
348 #define for_each_rmrr_units(rmrr) \
349 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
350
351 /* number of registered IOMMUs; used to size the g_iommus array */
352 static int g_num_of_iommus;
353
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static void domain_context_clear(struct intel_iommu *iommu,
359 struct device *dev);
360 static int domain_detach_iommu(struct dmar_domain *domain,
361 struct intel_iommu *iommu);
362 static bool device_is_rmrr_locked(struct device *dev);
363 static int intel_iommu_attach_device(struct iommu_domain *domain,
364 struct device *dev);
365 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
366 dma_addr_t iova);
367
368 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
369 int dmar_disabled = 0;
370 #else
371 int dmar_disabled = 1;
372 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
373
374 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
375 int intel_iommu_sm = 1;
376 #else
377 int intel_iommu_sm;
378 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
379
380 int intel_iommu_enabled = 0;
381 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
382
383 static int dmar_map_gfx = 1;
384 static int dmar_forcedac;
385 static int intel_iommu_strict;
386 static int intel_iommu_superpage = 1;
387 static int iommu_identity_mapping;
388 static int intel_no_bounce;
389
390 #define IDENTMAP_GFX 2
391 #define IDENTMAP_AZALIA 4
392
393 int intel_iommu_gfx_mapped;
394 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
395
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
398 DEFINE_SPINLOCK(device_domain_lock);
399 static LIST_HEAD(device_domain_list);
400
401 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
402 to_pci_dev(d)->untrusted)
403
404 /*
405 * Iterate over elements in device_domain_list and call the specified
406 * callback @fn against each element.
407 */
408 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
409 void *data), void *data)
410 {
411 int ret = 0;
412 unsigned long flags;
413 struct device_domain_info *info;
414
415 spin_lock_irqsave(&device_domain_lock, flags);
416 list_for_each_entry(info, &device_domain_list, global) {
417 ret = fn(info, data);
418 if (ret) {
419 spin_unlock_irqrestore(&device_domain_lock, flags);
420 return ret;
421 }
422 }
423 spin_unlock_irqrestore(&device_domain_lock, flags);
424
425 return 0;
426 }
427
428 const struct iommu_ops intel_iommu_ops;
429
430 static bool translation_pre_enabled(struct intel_iommu *iommu)
431 {
432 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
433 }
434
435 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
436 {
437 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
438 }
439
440 static void init_translation_status(struct intel_iommu *iommu)
441 {
442 u32 gsts;
443
444 gsts = readl(iommu->reg + DMAR_GSTS_REG);
445 if (gsts & DMA_GSTS_TES)
446 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
447 }
448
449 /* Convert a generic struct iommu_domain to the private struct dmar_domain */
450 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
451 {
452 return container_of(dom, struct dmar_domain, domain);
453 }
454
455 static int __init intel_iommu_setup(char *str)
456 {
457 if (!str)
458 return -EINVAL;
459 while (*str) {
460 if (!strncmp(str, "on", 2)) {
461 dmar_disabled = 0;
462 pr_info("IOMMU enabled\n");
463 } else if (!strncmp(str, "off", 3)) {
464 dmar_disabled = 1;
465 no_platform_optin = 1;
466 pr_info("IOMMU disabled\n");
467 } else if (!strncmp(str, "igfx_off", 8)) {
468 dmar_map_gfx = 0;
469 pr_info("Disable GFX device mapping\n");
470 } else if (!strncmp(str, "forcedac", 8)) {
471 pr_info("Forcing DAC for PCI devices\n");
472 dmar_forcedac = 1;
473 } else if (!strncmp(str, "strict", 6)) {
474 pr_info("Disable batched IOTLB flush\n");
475 intel_iommu_strict = 1;
476 } else if (!strncmp(str, "sp_off", 6)) {
477 pr_info("Disable supported super page\n");
478 intel_iommu_superpage = 0;
479 } else if (!strncmp(str, "sm_on", 5)) {
480 pr_info("Intel-IOMMU: scalable mode supported\n");
481 intel_iommu_sm = 1;
482 } else if (!strncmp(str, "tboot_noforce", 13)) {
483 printk(KERN_INFO
484 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
485 intel_iommu_tboot_noforce = 1;
486 } else if (!strncmp(str, "nobounce", 8)) {
487 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
488 intel_no_bounce = 1;
489 }
490
491 str += strcspn(str, ",");
492 while (*str == ',')
493 str++;
494 }
495 return 0;
496 }
497 __setup("intel_iommu=", intel_iommu_setup);
498
499 static struct kmem_cache *iommu_domain_cache;
500 static struct kmem_cache *iommu_devinfo_cache;
501
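/*
 * iommu->domains is a two-level table: the top level is indexed by the
 * high byte of the domain id and points to 256-entry pages of struct
 * dmar_domain pointers, indexed by the low byte. Pages are allocated
 * lazily in set_iommu_domain().
 */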
502 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
503 {
504 struct dmar_domain **domains;
505 int idx = did >> 8;
506
507 domains = iommu->domains[idx];
508 if (!domains)
509 return NULL;
510
511 return domains[did & 0xff];
512 }
513
514 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
515 struct dmar_domain *domain)
516 {
517 struct dmar_domain **domains;
518 int idx = did >> 8;
519
520 if (!iommu->domains[idx]) {
521 size_t size = 256 * sizeof(struct dmar_domain *);
522 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
523 }
524
525 domains = iommu->domains[idx];
526 if (WARN_ON(!domains))
527 return;
528 else
529 domains[did & 0xff] = domain;
530 }
531
532 void *alloc_pgtable_page(int node)
533 {
534 struct page *page;
535 void *vaddr = NULL;
536
537 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
538 if (page)
539 vaddr = page_address(page);
540 return vaddr;
541 }
542
543 void free_pgtable_page(void *vaddr)
544 {
545 free_page((unsigned long)vaddr);
546 }
547
548 static inline void *alloc_domain_mem(void)
549 {
550 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
551 }
552
553 static void free_domain_mem(void *vaddr)
554 {
555 kmem_cache_free(iommu_domain_cache, vaddr);
556 }
557
558 static inline void *alloc_devinfo_mem(void)
559 {
560 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
561 }
562
563 static inline void free_devinfo_mem(void *vaddr)
564 {
565 kmem_cache_free(iommu_devinfo_cache, vaddr);
566 }
567
568 static inline int domain_type_is_si(struct dmar_domain *domain)
569 {
570 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
571 }
572
573 static inline bool domain_use_first_level(struct dmar_domain *domain)
574 {
575 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
576 }
577
578 static inline int domain_pfn_supported(struct dmar_domain *domain,
579 unsigned long pfn)
580 {
581 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
582
583 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
584 }
585
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
587 {
588 unsigned long sagaw;
589 int agaw = -1;
590
591 sagaw = cap_sagaw(iommu->cap);
592 for (agaw = width_to_agaw(max_gaw);
593 agaw >= 0; agaw--) {
594 if (test_bit(agaw, &sagaw))
595 break;
596 }
597
598 return agaw;
599 }
600
601 /*
602 * Calculate max SAGAW for each iommu.
603 */
604 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
605 {
606 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
607 }
608
609 /*
610  * Calculate the agaw for each iommu.
611  * "SAGAW" may be different across iommus: use a default agaw, and
612  * fall back to a smaller supported agaw for iommus that don't support the default.
613 */
614 int iommu_calculate_agaw(struct intel_iommu *iommu)
615 {
616 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
617 }
618
619 /* This function only returns a single iommu in a domain */
620 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
621 {
622 int iommu_id;
623
624 /* si_domain and vm domain should not get here. */
625 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
626 return NULL;
627
628 for_each_domain_iommu(iommu_id, domain)
629 break;
630
631 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
632 return NULL;
633
634 return g_iommus[iommu_id];
635 }
636
637 static void domain_update_iommu_coherency(struct dmar_domain *domain)
638 {
639 struct dmar_drhd_unit *drhd;
640 struct intel_iommu *iommu;
641 bool found = false;
642 int i;
643
644 domain->iommu_coherency = 1;
645
646 for_each_domain_iommu(i, domain) {
647 found = true;
648 if (!ecap_coherent(g_iommus[i]->ecap)) {
649 domain->iommu_coherency = 0;
650 break;
651 }
652 }
653 if (found)
654 return;
655
656 /* No hardware attached; use lowest common denominator */
657 rcu_read_lock();
658 for_each_active_iommu(iommu, drhd) {
659 if (!ecap_coherent(iommu->ecap)) {
660 domain->iommu_coherency = 0;
661 break;
662 }
663 }
664 rcu_read_unlock();
665 }
666
667 static int domain_update_iommu_snooping(struct intel_iommu *skip)
668 {
669 struct dmar_drhd_unit *drhd;
670 struct intel_iommu *iommu;
671 int ret = 1;
672
673 rcu_read_lock();
674 for_each_active_iommu(iommu, drhd) {
675 if (iommu != skip) {
676 if (!ecap_sc_support(iommu->ecap)) {
677 ret = 0;
678 break;
679 }
680 }
681 }
682 rcu_read_unlock();
683
684 return ret;
685 }
686
687 static int domain_update_iommu_superpage(struct dmar_domain *domain,
688 struct intel_iommu *skip)
689 {
690 struct dmar_drhd_unit *drhd;
691 struct intel_iommu *iommu;
692 int mask = 0x3;
693
694 if (!intel_iommu_superpage) {
695 return 0;
696 }
697
698 /* set iommu_superpage to the smallest common denominator */
699 rcu_read_lock();
700 for_each_active_iommu(iommu, drhd) {
701 if (iommu != skip) {
702 if (domain && domain_use_first_level(domain)) {
703 if (!cap_fl1gp_support(iommu->cap))
704 mask = 0x1;
705 } else {
706 mask &= cap_super_page_val(iommu->cap);
707 }
708
709 if (!mask)
710 break;
711 }
712 }
713 rcu_read_unlock();
714
715 return fls(mask);
716 }
717
718 /* Some capabilities may be different across iommus */
719 static void domain_update_iommu_cap(struct dmar_domain *domain)
720 {
721 domain_update_iommu_coherency(domain);
722 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
723 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
724 }
725
726 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
727 u8 devfn, int alloc)
728 {
729 struct root_entry *root = &iommu->root_entry[bus];
730 struct context_entry *context;
731 u64 *entry;
732
733 entry = &root->lo;
734 if (sm_supported(iommu)) {
735 if (devfn >= 0x80) {
736 devfn -= 0x80;
737 entry = &root->hi;
738 }
739 devfn *= 2;
740 }
741 if (*entry & 1)
742 context = phys_to_virt(*entry & VTD_PAGE_MASK);
743 else {
744 unsigned long phy_addr;
745 if (!alloc)
746 return NULL;
747
748 context = alloc_pgtable_page(iommu->node);
749 if (!context)
750 return NULL;
751
752 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
753 phy_addr = virt_to_phys((void *)context);
754 *entry = phy_addr | 1;
755 __iommu_flush_cache(iommu, entry, sizeof(*entry));
756 }
757 return &context[devfn];
758 }
759
760 static int iommu_dummy(struct device *dev)
761 {
762 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
763 }
764
765 static bool attach_deferred(struct device *dev)
766 {
767 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
768 }
769
770 /**
771 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
772 * sub-hierarchy of a candidate PCI-PCI bridge
773 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
774 * @bridge: the candidate PCI-PCI bridge
775 *
776 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
777 */
778 static bool
779 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
780 {
781 struct pci_dev *pdev, *pbridge;
782
783 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
784 return false;
785
786 pdev = to_pci_dev(dev);
787 pbridge = to_pci_dev(bridge);
788
789 if (pbridge->subordinate &&
790 pbridge->subordinate->number <= pdev->bus->number &&
791 pbridge->subordinate->busn_res.end >= pdev->bus->number)
792 return true;
793
794 return false;
795 }
796
797 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
798 {
799 struct dmar_drhd_unit *drhd = NULL;
800 struct intel_iommu *iommu;
801 struct device *tmp;
802 struct pci_dev *pdev = NULL;
803 u16 segment = 0;
804 int i;
805
806 if (iommu_dummy(dev))
807 return NULL;
808
809 if (dev_is_pci(dev)) {
810 struct pci_dev *pf_pdev;
811
812 pdev = pci_real_dma_dev(to_pci_dev(dev));
813
814 /* VFs aren't listed in scope tables; we need to look up
815 * the PF instead to find the IOMMU. */
816 pf_pdev = pci_physfn(pdev);
817 dev = &pf_pdev->dev;
818 segment = pci_domain_nr(pdev->bus);
819 } else if (has_acpi_companion(dev))
820 dev = &ACPI_COMPANION(dev)->dev;
821
822 rcu_read_lock();
823 for_each_active_iommu(iommu, drhd) {
824 if (pdev && segment != drhd->segment)
825 continue;
826
827 for_each_active_dev_scope(drhd->devices,
828 drhd->devices_cnt, i, tmp) {
829 if (tmp == dev) {
830 /* For a VF use its original BDF# not that of the PF
831 * which we used for the IOMMU lookup. Strictly speaking
832 * we could do this for all PCI devices; we only need to
833 * get the BDF# from the scope table for ACPI matches. */
834 if (pdev && pdev->is_virtfn)
835 goto got_pdev;
836
837 *bus = drhd->devices[i].bus;
838 *devfn = drhd->devices[i].devfn;
839 goto out;
840 }
841
842 if (is_downstream_to_pci_bridge(dev, tmp))
843 goto got_pdev;
844 }
845
846 if (pdev && drhd->include_all) {
847 got_pdev:
848 *bus = pdev->bus->number;
849 *devfn = pdev->devfn;
850 goto out;
851 }
852 }
853 iommu = NULL;
854 out:
855 rcu_read_unlock();
856
857 return iommu;
858 }
859
860 static void domain_flush_cache(struct dmar_domain *domain,
861 void *addr, int size)
862 {
863 if (!domain->iommu_coherency)
864 clflush_cache_range(addr, size);
865 }
866
867 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
868 {
869 struct context_entry *context;
870 int ret = 0;
871 unsigned long flags;
872
873 spin_lock_irqsave(&iommu->lock, flags);
874 context = iommu_context_addr(iommu, bus, devfn, 0);
875 if (context)
876 ret = context_present(context);
877 spin_unlock_irqrestore(&iommu->lock, flags);
878 return ret;
879 }
880
881 static void free_context_table(struct intel_iommu *iommu)
882 {
883 int i;
884 unsigned long flags;
885 struct context_entry *context;
886
887 spin_lock_irqsave(&iommu->lock, flags);
888 if (!iommu->root_entry) {
889 goto out;
890 }
891 for (i = 0; i < ROOT_ENTRY_NR; i++) {
892 context = iommu_context_addr(iommu, i, 0, 0);
893 if (context)
894 free_pgtable_page(context);
895
896 if (!sm_supported(iommu))
897 continue;
898
899 context = iommu_context_addr(iommu, i, 0x80, 0);
900 if (context)
901 free_pgtable_page(context);
902
903 }
904 free_pgtable_page(iommu->root_entry);
905 iommu->root_entry = NULL;
906 out:
907 spin_unlock_irqrestore(&iommu->lock, flags);
908 }
909
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 unsigned long pfn, int *target_level)
912 {
913 struct dma_pte *parent, *pte;
914 int level = agaw_to_level(domain->agaw);
915 int offset;
916
917 BUG_ON(!domain->pgd);
918
919 if (!domain_pfn_supported(domain, pfn))
920 /* Address beyond IOMMU's addressing capabilities. */
921 return NULL;
922
923 parent = domain->pgd;
924
925 while (1) {
926 void *tmp_page;
927
928 offset = pfn_level_offset(pfn, level);
929 pte = &parent[offset];
930 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
931 break;
932 if (level == *target_level)
933 break;
934
935 if (!dma_pte_present(pte)) {
936 uint64_t pteval;
937
938 tmp_page = alloc_pgtable_page(domain->nid);
939
940 if (!tmp_page)
941 return NULL;
942
943 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945 if (domain_use_first_level(domain))
946 pteval |= DMA_FL_PTE_XD;
947 if (cmpxchg64(&pte->val, 0ULL, pteval))
948 /* Someone else set it while we were thinking; use theirs. */
949 free_pgtable_page(tmp_page);
950 else
951 domain_flush_cache(domain, pte, sizeof(*pte));
952 }
953 if (level == 1)
954 break;
955
956 parent = phys_to_virt(dma_pte_addr(pte));
957 level--;
958 }
959
960 if (!*target_level)
961 *target_level = level;
962
963 return pte;
964 }
965
966 /* return address's pte at specific level */
967 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
968 unsigned long pfn,
969 int level, int *large_page)
970 {
971 struct dma_pte *parent, *pte;
972 int total = agaw_to_level(domain->agaw);
973 int offset;
974
975 parent = domain->pgd;
976 while (level <= total) {
977 offset = pfn_level_offset(pfn, total);
978 pte = &parent[offset];
979 if (level == total)
980 return pte;
981
982 if (!dma_pte_present(pte)) {
983 *large_page = total;
984 break;
985 }
986
987 if (dma_pte_superpage(pte)) {
988 *large_page = total;
989 return pte;
990 }
991
992 parent = phys_to_virt(dma_pte_addr(pte));
993 total--;
994 }
995 return NULL;
996 }
997
998 /* clear last level pte; a TLB flush should follow */
999 static void dma_pte_clear_range(struct dmar_domain *domain,
1000 unsigned long start_pfn,
1001 unsigned long last_pfn)
1002 {
1003 unsigned int large_page;
1004 struct dma_pte *first_pte, *pte;
1005
1006 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1007 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1008 BUG_ON(start_pfn > last_pfn);
1009
1010 /* we don't need lock here; nobody else touches the iova range */
1011 do {
1012 large_page = 1;
1013 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1014 if (!pte) {
1015 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1016 continue;
1017 }
1018 do {
1019 dma_clear_pte(pte);
1020 start_pfn += lvl_to_nr_pages(large_page);
1021 pte++;
1022 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1023
1024 domain_flush_cache(domain, first_pte,
1025 (void *)pte - (void *)first_pte);
1026
1027 } while (start_pfn && start_pfn <= last_pfn);
1028 }
1029
1030 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1031 int retain_level, struct dma_pte *pte,
1032 unsigned long pfn, unsigned long start_pfn,
1033 unsigned long last_pfn)
1034 {
1035 pfn = max(start_pfn, pfn);
1036 pte = &pte[pfn_level_offset(pfn, level)];
1037
1038 do {
1039 unsigned long level_pfn;
1040 struct dma_pte *level_pte;
1041
1042 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1043 goto next;
1044
1045 level_pfn = pfn & level_mask(level);
1046 level_pte = phys_to_virt(dma_pte_addr(pte));
1047
1048 if (level > 2) {
1049 dma_pte_free_level(domain, level - 1, retain_level,
1050 level_pte, level_pfn, start_pfn,
1051 last_pfn);
1052 }
1053
1054 /*
1055 * Free the page table if we're below the level we want to
1056 * retain and the range covers the entire table.
1057 */
1058 if (level < retain_level && !(start_pfn > level_pfn ||
1059 last_pfn < level_pfn + level_size(level) - 1)) {
1060 dma_clear_pte(pte);
1061 domain_flush_cache(domain, pte, sizeof(*pte));
1062 free_pgtable_page(level_pte);
1063 }
1064 next:
1065 pfn += level_size(level);
1066 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1067 }
1068
1069 /*
1070 * clear last level (leaf) ptes and free page table pages below the
1071 * level we wish to keep intact.
1072 */
1073 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1074 unsigned long start_pfn,
1075 unsigned long last_pfn,
1076 int retain_level)
1077 {
1078 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1079 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1080 BUG_ON(start_pfn > last_pfn);
1081
1082 dma_pte_clear_range(domain, start_pfn, last_pfn);
1083
1084 /* We don't need lock here; nobody else touches the iova range */
1085 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1086 domain->pgd, 0, start_pfn, last_pfn);
1087
1088 /* free pgd */
1089 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090 free_pgtable_page(domain->pgd);
1091 domain->pgd = NULL;
1092 }
1093 }
1094
1095 /* When a page at a given level is being unlinked from its parent, we don't
1096 need to *modify* it at all. All we need to do is make a list of all the
1097 pages which can be freed just as soon as we've flushed the IOTLB and we
1098 know the hardware page-walk will no longer touch them.
1099 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1100 be freed. */
1101 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1102 int level, struct dma_pte *pte,
1103 struct page *freelist)
1104 {
1105 struct page *pg;
1106
1107 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1108 pg->freelist = freelist;
1109 freelist = pg;
1110
1111 if (level == 1)
1112 return freelist;
1113
1114 pte = page_address(pg);
1115 do {
1116 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117 freelist = dma_pte_list_pagetables(domain, level - 1,
1118 pte, freelist);
1119 pte++;
1120 } while (!first_pte_in_page(pte));
1121
1122 return freelist;
1123 }
1124
1125 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1126 struct dma_pte *pte, unsigned long pfn,
1127 unsigned long start_pfn,
1128 unsigned long last_pfn,
1129 struct page *freelist)
1130 {
1131 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1132
1133 pfn = max(start_pfn, pfn);
1134 pte = &pte[pfn_level_offset(pfn, level)];
1135
1136 do {
1137 unsigned long level_pfn;
1138
1139 if (!dma_pte_present(pte))
1140 goto next;
1141
1142 level_pfn = pfn & level_mask(level);
1143
1144 /* If range covers entire pagetable, free it */
1145 if (start_pfn <= level_pfn &&
1146 last_pfn >= level_pfn + level_size(level) - 1) {
1147 			/* These subordinate page tables are going away entirely. Don't
1148 bother to clear them; we're just going to *free* them. */
1149 if (level > 1 && !dma_pte_superpage(pte))
1150 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1151
1152 dma_clear_pte(pte);
1153 if (!first_pte)
1154 first_pte = pte;
1155 last_pte = pte;
1156 } else if (level > 1) {
1157 /* Recurse down into a level that isn't *entirely* obsolete */
1158 freelist = dma_pte_clear_level(domain, level - 1,
1159 phys_to_virt(dma_pte_addr(pte)),
1160 level_pfn, start_pfn, last_pfn,
1161 freelist);
1162 }
1163 next:
1164 pfn += level_size(level);
1165 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1166
1167 if (first_pte)
1168 domain_flush_cache(domain, first_pte,
1169 (void *)++last_pte - (void *)first_pte);
1170
1171 return freelist;
1172 }
1173
1174 /* We can't just free the pages because the IOMMU may still be walking
1175 the page tables, and may have cached the intermediate levels. The
1176 pages can only be freed after the IOTLB flush has been done. */
1177 static struct page *domain_unmap(struct dmar_domain *domain,
1178 unsigned long start_pfn,
1179 unsigned long last_pfn)
1180 {
1181 struct page *freelist;
1182
1183 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1184 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1185 BUG_ON(start_pfn > last_pfn);
1186
1187 /* we don't need lock here; nobody else touches the iova range */
1188 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1189 domain->pgd, 0, start_pfn, last_pfn, NULL);
1190
1191 /* free pgd */
1192 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1193 struct page *pgd_page = virt_to_page(domain->pgd);
1194 pgd_page->freelist = freelist;
1195 freelist = pgd_page;
1196
1197 domain->pgd = NULL;
1198 }
1199
1200 return freelist;
1201 }
1202
1203 static void dma_free_pagelist(struct page *freelist)
1204 {
1205 struct page *pg;
1206
1207 while ((pg = freelist)) {
1208 freelist = pg->freelist;
1209 free_pgtable_page(page_address(pg));
1210 }
1211 }
1212
1213 static void iova_entry_free(unsigned long data)
1214 {
1215 struct page *freelist = (struct page *)data;
1216
1217 dma_free_pagelist(freelist);
1218 }
1219
1220 /* iommu handling */
1221 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1222 {
1223 struct root_entry *root;
1224 unsigned long flags;
1225
1226 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1227 if (!root) {
1228 pr_err("Allocating root entry for %s failed\n",
1229 iommu->name);
1230 return -ENOMEM;
1231 }
1232
1233 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1234
1235 spin_lock_irqsave(&iommu->lock, flags);
1236 iommu->root_entry = root;
1237 spin_unlock_irqrestore(&iommu->lock, flags);
1238
1239 return 0;
1240 }
1241
1242 static void iommu_set_root_entry(struct intel_iommu *iommu)
1243 {
1244 u64 addr;
1245 u32 sts;
1246 unsigned long flag;
1247
1248 addr = virt_to_phys(iommu->root_entry);
1249 if (sm_supported(iommu))
1250 addr |= DMA_RTADDR_SMT;
1251
1252 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1254
1255 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1256
1257 /* Make sure hardware complete it */
1258 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1259 readl, (sts & DMA_GSTS_RTPS), sts);
1260
1261 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1262 }
1263
1264 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1265 {
1266 u32 val;
1267 unsigned long flag;
1268
1269 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1270 return;
1271
1272 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1273 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1274
1275 /* Make sure hardware complete it */
1276 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1277 readl, (!(val & DMA_GSTS_WBFS)), val);
1278
1279 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1280 }
1281
1282 /* return value determines if we need a write buffer flush */
1283 static void __iommu_flush_context(struct intel_iommu *iommu,
1284 u16 did, u16 source_id, u8 function_mask,
1285 u64 type)
1286 {
1287 u64 val = 0;
1288 unsigned long flag;
1289
1290 switch (type) {
1291 case DMA_CCMD_GLOBAL_INVL:
1292 val = DMA_CCMD_GLOBAL_INVL;
1293 break;
1294 case DMA_CCMD_DOMAIN_INVL:
1295 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1296 break;
1297 case DMA_CCMD_DEVICE_INVL:
1298 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1299 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1300 break;
1301 default:
1302 BUG();
1303 }
1304 val |= DMA_CCMD_ICC;
1305
1306 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1307 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1308
1309 /* Make sure hardware complete it */
1310 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1311 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1312
1313 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1314 }
1315
1316 /* return value determines if we need a write buffer flush */
1317 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1318 u64 addr, unsigned int size_order, u64 type)
1319 {
1320 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1321 u64 val = 0, val_iva = 0;
1322 unsigned long flag;
1323
1324 switch (type) {
1325 case DMA_TLB_GLOBAL_FLUSH:
1326 		/* global flush doesn't need to set IVA_REG */
1327 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1328 break;
1329 case DMA_TLB_DSI_FLUSH:
1330 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1331 break;
1332 case DMA_TLB_PSI_FLUSH:
1333 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1334 /* IH bit is passed in as part of address */
1335 val_iva = size_order | addr;
1336 break;
1337 default:
1338 BUG();
1339 }
1340 /* Note: set drain read/write */
1341 #if 0
1342 /*
1343 	 * This is probably just for extra safety. Looks like we can
1344 * ignore it without any impact.
1345 */
1346 if (cap_read_drain(iommu->cap))
1347 val |= DMA_TLB_READ_DRAIN;
1348 #endif
1349 if (cap_write_drain(iommu->cap))
1350 val |= DMA_TLB_WRITE_DRAIN;
1351
1352 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1353 /* Note: Only uses first TLB reg currently */
1354 if (val_iva)
1355 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1356 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1357
1358 /* Make sure hardware complete it */
1359 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1360 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1361
1362 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1363
1364 /* check IOTLB invalidation granularity */
1365 if (DMA_TLB_IAIG(val) == 0)
1366 pr_err("Flush IOTLB failed\n");
1367 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1368 pr_debug("TLB flush request %Lx, actual %Lx\n",
1369 (unsigned long long)DMA_TLB_IIRG(type),
1370 (unsigned long long)DMA_TLB_IAIG(val));
1371 }
1372
1373 static struct device_domain_info *
1374 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1375 u8 bus, u8 devfn)
1376 {
1377 struct device_domain_info *info;
1378
1379 assert_spin_locked(&device_domain_lock);
1380
1381 if (!iommu->qi)
1382 return NULL;
1383
1384 list_for_each_entry(info, &domain->devices, link)
1385 if (info->iommu == iommu && info->bus == bus &&
1386 info->devfn == devfn) {
1387 if (info->ats_supported && info->dev)
1388 return info;
1389 break;
1390 }
1391
1392 return NULL;
1393 }
1394
1395 static void domain_update_iotlb(struct dmar_domain *domain)
1396 {
1397 struct device_domain_info *info;
1398 bool has_iotlb_device = false;
1399
1400 assert_spin_locked(&device_domain_lock);
1401
1402 list_for_each_entry(info, &domain->devices, link) {
1403 struct pci_dev *pdev;
1404
1405 if (!info->dev || !dev_is_pci(info->dev))
1406 continue;
1407
1408 pdev = to_pci_dev(info->dev);
1409 if (pdev->ats_enabled) {
1410 has_iotlb_device = true;
1411 break;
1412 }
1413 }
1414
1415 domain->has_iotlb_device = has_iotlb_device;
1416 }
1417
1418 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1419 {
1420 struct pci_dev *pdev;
1421
1422 assert_spin_locked(&device_domain_lock);
1423
1424 if (!info || !dev_is_pci(info->dev))
1425 return;
1426
1427 pdev = to_pci_dev(info->dev);
1428 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1429 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1430 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1431 * reserved, which should be set to 0.
1432 */
1433 if (!ecap_dit(info->iommu->ecap))
1434 info->pfsid = 0;
1435 else {
1436 struct pci_dev *pf_pdev;
1437
1438 /* pdev will be returned if device is not a vf */
1439 pf_pdev = pci_physfn(pdev);
1440 info->pfsid = pci_dev_id(pf_pdev);
1441 }
1442
1443 #ifdef CONFIG_INTEL_IOMMU_SVM
1444 /* The PCIe spec, in its wisdom, declares that the behaviour of
1445 the device if you enable PASID support after ATS support is
1446 undefined. So always enable PASID support on devices which
1447 have it, even if we can't yet know if we're ever going to
1448 use it. */
1449 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1450 info->pasid_enabled = 1;
1451
1452 if (info->pri_supported &&
1453 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1454 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1455 info->pri_enabled = 1;
1456 #endif
1457 if (!pdev->untrusted && info->ats_supported &&
1458 pci_ats_page_aligned(pdev) &&
1459 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1460 info->ats_enabled = 1;
1461 domain_update_iotlb(info->domain);
1462 info->ats_qdep = pci_ats_queue_depth(pdev);
1463 }
1464 }
1465
1466 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1467 {
1468 struct pci_dev *pdev;
1469
1470 assert_spin_locked(&device_domain_lock);
1471
1472 if (!dev_is_pci(info->dev))
1473 return;
1474
1475 pdev = to_pci_dev(info->dev);
1476
1477 if (info->ats_enabled) {
1478 pci_disable_ats(pdev);
1479 info->ats_enabled = 0;
1480 domain_update_iotlb(info->domain);
1481 }
1482 #ifdef CONFIG_INTEL_IOMMU_SVM
1483 if (info->pri_enabled) {
1484 pci_disable_pri(pdev);
1485 info->pri_enabled = 0;
1486 }
1487 if (info->pasid_enabled) {
1488 pci_disable_pasid(pdev);
1489 info->pasid_enabled = 0;
1490 }
1491 #endif
1492 }
1493
1494 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1495 u64 addr, unsigned mask)
1496 {
1497 u16 sid, qdep;
1498 unsigned long flags;
1499 struct device_domain_info *info;
1500
1501 if (!domain->has_iotlb_device)
1502 return;
1503
1504 spin_lock_irqsave(&device_domain_lock, flags);
1505 list_for_each_entry(info, &domain->devices, link) {
1506 if (!info->ats_enabled)
1507 continue;
1508
1509 sid = info->bus << 8 | info->devfn;
1510 qdep = info->ats_qdep;
1511 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1512 qdep, addr, mask);
1513 }
1514 spin_unlock_irqrestore(&device_domain_lock, flags);
1515 }
1516
1517 static void domain_flush_piotlb(struct intel_iommu *iommu,
1518 struct dmar_domain *domain,
1519 u64 addr, unsigned long npages, bool ih)
1520 {
1521 u16 did = domain->iommu_did[iommu->seq_id];
1522
1523 if (domain->default_pasid)
1524 qi_flush_piotlb(iommu, did, domain->default_pasid,
1525 addr, npages, ih);
1526
1527 if (!list_empty(&domain->devices))
1528 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1529 }
1530
1531 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1532 struct dmar_domain *domain,
1533 unsigned long pfn, unsigned int pages,
1534 int ih, int map)
1535 {
1536 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1537 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1538 u16 did = domain->iommu_did[iommu->seq_id];
1539
1540 BUG_ON(pages == 0);
1541
1542 if (ih)
1543 ih = 1 << 6;
1544
1545 if (domain_use_first_level(domain)) {
1546 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1547 } else {
1548 /*
1549 * Fallback to domain selective flush if no PSI support or
1550 * the size is too big. PSI requires page size to be 2 ^ x,
1551 * and the base address is naturally aligned to the size.
1552 */
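		/*
		 * Worked example: pages == 9 rounds up to 16, so mask == 4
		 * and the PSI covers 16 VT-d pages starting at a 16-page
		 * aligned address.
		 */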
1553 if (!cap_pgsel_inv(iommu->cap) ||
1554 mask > cap_max_amask_val(iommu->cap))
1555 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1556 DMA_TLB_DSI_FLUSH);
1557 else
1558 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1559 DMA_TLB_PSI_FLUSH);
1560 }
1561
1562 /*
1563 * In caching mode, changes of pages from non-present to present require
1564 	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1565 */
1566 if (!cap_caching_mode(iommu->cap) || !map)
1567 iommu_flush_dev_iotlb(domain, addr, mask);
1568 }
1569
1570 /* Notification for newly created mappings */
1571 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1572 struct dmar_domain *domain,
1573 unsigned long pfn, unsigned int pages)
1574 {
1575 /*
1576 * It's a non-present to present mapping. Only flush if caching mode
1577 * and second level.
1578 */
1579 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1580 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1581 else
1582 iommu_flush_write_buffer(iommu);
1583 }
1584
1585 static void iommu_flush_iova(struct iova_domain *iovad)
1586 {
1587 struct dmar_domain *domain;
1588 int idx;
1589
1590 domain = container_of(iovad, struct dmar_domain, iovad);
1591
1592 for_each_domain_iommu(idx, domain) {
1593 struct intel_iommu *iommu = g_iommus[idx];
1594 u16 did = domain->iommu_did[iommu->seq_id];
1595
1596 if (domain_use_first_level(domain))
1597 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1598 else
1599 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1600 DMA_TLB_DSI_FLUSH);
1601
1602 if (!cap_caching_mode(iommu->cap))
1603 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1604 0, MAX_AGAW_PFN_WIDTH);
1605 }
1606 }
1607
1608 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1609 {
1610 u32 pmen;
1611 unsigned long flags;
1612
1613 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1614 return;
1615
1616 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1617 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1618 pmen &= ~DMA_PMEN_EPM;
1619 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1620
1621 /* wait for the protected region status bit to clear */
1622 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1623 readl, !(pmen & DMA_PMEN_PRS), pmen);
1624
1625 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1626 }
1627
1628 static void iommu_enable_translation(struct intel_iommu *iommu)
1629 {
1630 u32 sts;
1631 unsigned long flags;
1632
1633 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1634 iommu->gcmd |= DMA_GCMD_TE;
1635 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1636
1637 /* Make sure hardware complete it */
1638 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1639 readl, (sts & DMA_GSTS_TES), sts);
1640
1641 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1642 }
1643
1644 static void iommu_disable_translation(struct intel_iommu *iommu)
1645 {
1646 u32 sts;
1647 unsigned long flag;
1648
1649 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1650 iommu->gcmd &= ~DMA_GCMD_TE;
1651 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1652
1653 /* Make sure hardware complete it */
1654 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1655 readl, (!(sts & DMA_GSTS_TES)), sts);
1656
1657 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1658 }
1659
1660 static int iommu_init_domains(struct intel_iommu *iommu)
1661 {
1662 u32 ndomains, nlongs;
1663 size_t size;
1664
1665 ndomains = cap_ndoms(iommu->cap);
1666 pr_debug("%s: Number of Domains supported <%d>\n",
1667 iommu->name, ndomains);
1668 nlongs = BITS_TO_LONGS(ndomains);
1669
1670 spin_lock_init(&iommu->lock);
1671
1672 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1673 if (!iommu->domain_ids) {
1674 pr_err("%s: Allocating domain id array failed\n",
1675 iommu->name);
1676 return -ENOMEM;
1677 }
1678
1679 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1680 iommu->domains = kzalloc(size, GFP_KERNEL);
1681
1682 if (iommu->domains) {
1683 size = 256 * sizeof(struct dmar_domain *);
1684 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1685 }
1686
1687 if (!iommu->domains || !iommu->domains[0]) {
1688 pr_err("%s: Allocating domain array failed\n",
1689 iommu->name);
1690 kfree(iommu->domain_ids);
1691 kfree(iommu->domains);
1692 iommu->domain_ids = NULL;
1693 iommu->domains = NULL;
1694 return -ENOMEM;
1695 }
1696
1697 /*
1698 * If Caching mode is set, then invalid translations are tagged
1699 * with domain-id 0, hence we need to pre-allocate it. We also
1700 * use domain-id 0 as a marker for non-allocated domain-id, so
1701 * make sure it is not used for a real domain.
1702 */
1703 set_bit(0, iommu->domain_ids);
1704
1705 /*
1706 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1707 * entry for first-level or pass-through translation modes should
1708 * be programmed with a domain id different from those used for
1709 * second-level or nested translation. We reserve a domain id for
1710 * this purpose.
1711 */
1712 if (sm_supported(iommu))
1713 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1714
1715 return 0;
1716 }
1717
1718 static void disable_dmar_iommu(struct intel_iommu *iommu)
1719 {
1720 struct device_domain_info *info, *tmp;
1721 unsigned long flags;
1722
1723 if (!iommu->domains || !iommu->domain_ids)
1724 return;
1725
1726 spin_lock_irqsave(&device_domain_lock, flags);
1727 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1728 if (info->iommu != iommu)
1729 continue;
1730
1731 if (!info->dev || !info->domain)
1732 continue;
1733
1734 __dmar_remove_one_dev_info(info);
1735 }
1736 spin_unlock_irqrestore(&device_domain_lock, flags);
1737
1738 if (iommu->gcmd & DMA_GCMD_TE)
1739 iommu_disable_translation(iommu);
1740 }
1741
1742 static void free_dmar_iommu(struct intel_iommu *iommu)
1743 {
1744 if ((iommu->domains) && (iommu->domain_ids)) {
1745 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1746 int i;
1747
1748 for (i = 0; i < elems; i++)
1749 kfree(iommu->domains[i]);
1750 kfree(iommu->domains);
1751 kfree(iommu->domain_ids);
1752 iommu->domains = NULL;
1753 iommu->domain_ids = NULL;
1754 }
1755
1756 g_iommus[iommu->seq_id] = NULL;
1757
1758 /* free context mapping */
1759 free_context_table(iommu);
1760
1761 #ifdef CONFIG_INTEL_IOMMU_SVM
1762 if (pasid_supported(iommu)) {
1763 if (ecap_prs(iommu->ecap))
1764 intel_svm_finish_prq(iommu);
1765 }
1766 #endif
1767 }
1768
1769 /*
1770 * Check and return whether first level is used by default for
1771 * DMA translation.
1772 */
1773 static bool first_level_by_default(void)
1774 {
1775 struct dmar_drhd_unit *drhd;
1776 struct intel_iommu *iommu;
1777 static int first_level_support = -1;
1778
1779 if (likely(first_level_support != -1))
1780 return first_level_support;
1781
1782 first_level_support = 1;
1783
1784 rcu_read_lock();
1785 for_each_active_iommu(iommu, drhd) {
1786 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1787 first_level_support = 0;
1788 break;
1789 }
1790 }
1791 rcu_read_unlock();
1792
1793 return first_level_support;
1794 }
1795
1796 static struct dmar_domain *alloc_domain(int flags)
1797 {
1798 struct dmar_domain *domain;
1799
1800 domain = alloc_domain_mem();
1801 if (!domain)
1802 return NULL;
1803
1804 memset(domain, 0, sizeof(*domain));
1805 domain->nid = NUMA_NO_NODE;
1806 domain->flags = flags;
1807 if (first_level_by_default())
1808 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1809 domain->has_iotlb_device = false;
1810 INIT_LIST_HEAD(&domain->devices);
1811
1812 return domain;
1813 }
1814
1815 /* Must be called with iommu->lock */
1816 static int domain_attach_iommu(struct dmar_domain *domain,
1817 struct intel_iommu *iommu)
1818 {
1819 unsigned long ndomains;
1820 int num;
1821
1822 assert_spin_locked(&device_domain_lock);
1823 assert_spin_locked(&iommu->lock);
1824
1825 domain->iommu_refcnt[iommu->seq_id] += 1;
1826 domain->iommu_count += 1;
1827 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1828 ndomains = cap_ndoms(iommu->cap);
1829 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1830
1831 if (num >= ndomains) {
1832 pr_err("%s: No free domain ids\n", iommu->name);
1833 domain->iommu_refcnt[iommu->seq_id] -= 1;
1834 domain->iommu_count -= 1;
1835 return -ENOSPC;
1836 }
1837
1838 set_bit(num, iommu->domain_ids);
1839 set_iommu_domain(iommu, num, domain);
1840
1841 domain->iommu_did[iommu->seq_id] = num;
1842 domain->nid = iommu->node;
1843
1844 domain_update_iommu_cap(domain);
1845 }
1846
1847 return 0;
1848 }
1849
1850 static int domain_detach_iommu(struct dmar_domain *domain,
1851 struct intel_iommu *iommu)
1852 {
1853 int num, count;
1854
1855 assert_spin_locked(&device_domain_lock);
1856 assert_spin_locked(&iommu->lock);
1857
1858 domain->iommu_refcnt[iommu->seq_id] -= 1;
1859 count = --domain->iommu_count;
1860 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1861 num = domain->iommu_did[iommu->seq_id];
1862 clear_bit(num, iommu->domain_ids);
1863 set_iommu_domain(iommu, num, NULL);
1864
1865 domain_update_iommu_cap(domain);
1866 domain->iommu_did[iommu->seq_id] = 0;
1867 }
1868
1869 return count;
1870 }
1871
1872 static struct iova_domain reserved_iova_list;
1873 static struct lock_class_key reserved_rbtree_key;
1874
1875 static int dmar_init_reserved_ranges(void)
1876 {
1877 struct pci_dev *pdev = NULL;
1878 struct iova *iova;
1879 int i;
1880
1881 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1882
1883 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1884 &reserved_rbtree_key);
1885
1886 /* IOAPIC ranges shouldn't be accessed by DMA */
1887 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1888 IOVA_PFN(IOAPIC_RANGE_END));
1889 if (!iova) {
1890 pr_err("Reserve IOAPIC range failed\n");
1891 return -ENODEV;
1892 }
1893
1894 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1895 for_each_pci_dev(pdev) {
1896 struct resource *r;
1897
1898 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1899 r = &pdev->resource[i];
1900 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1901 continue;
1902 iova = reserve_iova(&reserved_iova_list,
1903 IOVA_PFN(r->start),
1904 IOVA_PFN(r->end));
1905 if (!iova) {
1906 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1907 return -ENODEV;
1908 }
1909 }
1910 }
1911 return 0;
1912 }
1913
1914 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1915 {
1916 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1917 }
1918
1919 static inline int guestwidth_to_adjustwidth(int gaw)
1920 {
1921 int agaw;
1922 int r = (gaw - 12) % 9;
1923
1924 if (r == 0)
1925 agaw = gaw;
1926 else
1927 agaw = gaw + 9 - r;
1928 if (agaw > 64)
1929 agaw = 64;
1930 return agaw;
1931 }
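/*
 * Worked example: a guest width of 40 bits gives r == (40 - 12) % 9 == 1,
 * so the width is rounded up to 48 bits, the next size the page-table
 * levels can fully cover.
 */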
1932
1933 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1934 int guest_width)
1935 {
1936 int adjust_width, agaw;
1937 unsigned long sagaw;
1938 int ret;
1939
1940 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1941
1942 if (!intel_iommu_strict) {
1943 ret = init_iova_flush_queue(&domain->iovad,
1944 iommu_flush_iova, iova_entry_free);
1945 if (ret)
1946 pr_info("iova flush queue initialization failed\n");
1947 }
1948
1949 domain_reserve_special_ranges(domain);
1950
1951 /* calculate AGAW */
1952 if (guest_width > cap_mgaw(iommu->cap))
1953 guest_width = cap_mgaw(iommu->cap);
1954 domain->gaw = guest_width;
1955 adjust_width = guestwidth_to_adjustwidth(guest_width);
1956 agaw = width_to_agaw(adjust_width);
1957 sagaw = cap_sagaw(iommu->cap);
1958 if (!test_bit(agaw, &sagaw)) {
1959 /* hardware doesn't support it, choose a bigger one */
1960 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1961 agaw = find_next_bit(&sagaw, 5, agaw);
1962 if (agaw >= 5)
1963 return -ENODEV;
1964 }
1965 domain->agaw = agaw;
1966
1967 if (ecap_coherent(iommu->ecap))
1968 domain->iommu_coherency = 1;
1969 else
1970 domain->iommu_coherency = 0;
1971
1972 if (ecap_sc_support(iommu->ecap))
1973 domain->iommu_snooping = 1;
1974 else
1975 domain->iommu_snooping = 0;
1976
1977 if (intel_iommu_superpage)
1978 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1979 else
1980 domain->iommu_superpage = 0;
1981
1982 domain->nid = iommu->node;
1983
1984 /* always allocate the top pgd */
1985 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1986 if (!domain->pgd)
1987 return -ENOMEM;
1988 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1989 return 0;
1990 }
1991
1992 static void domain_exit(struct dmar_domain *domain)
1993 {
1994
1995 /* Remove associated devices and clear attached or cached domains */
1996 domain_remove_dev_info(domain);
1997
1998 /* destroy iovas */
1999 put_iova_domain(&domain->iovad);
2000
2001 if (domain->pgd) {
2002 struct page *freelist;
2003
2004 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2005 dma_free_pagelist(freelist);
2006 }
2007
2008 free_domain_mem(domain);
2009 }
2010
2011 /*
2012 * Get the PASID directory size for scalable mode context entry.
2013 * Value of X in the PDTS field of a scalable mode context entry
2014 * indicates PASID directory with 2^(X + 7) entries.
2015 */
2016 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2017 {
2018 int pds, max_pde;
2019
2020 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2021 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2022 if (pds < 7)
2023 return 0;
2024
2025 return pds - 7;
2026 }
2027
2028 /*
2029 * Set the RID_PASID field of a scalable mode context entry. The
2030 * IOMMU hardware will use the PASID value set in this field for
2031 * DMA translations of DMA requests without PASID.
2032 */
2033 static inline void
2034 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2035 {
2036 context->hi |= pasid & ((1 << 20) - 1);
2037 context->hi |= (1 << 20);
2038 }
2039
2040 /*
2041 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2042 * entry.
2043 */
2044 static inline void context_set_sm_dte(struct context_entry *context)
2045 {
2046 context->lo |= (1 << 2);
2047 }
2048
2049 /*
2050 * Set the PRE(Page Request Enable) field of a scalable mode context
2051 * entry.
2052 */
2053 static inline void context_set_sm_pre(struct context_entry *context)
2054 {
2055 context->lo |= (1 << 4);
2056 }
2057
2058 /* Convert value to context PASID directory size field coding. */
2059 #define context_pdts(pds) (((pds) & 0x7) << 9)
2060
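/*
 * Program the context entry for (@bus, @devfn) on @iommu so that DMA
 * from the device is translated by @domain.  Handles both legacy and
 * scalable-mode entry formats, flushes stale entries copied from a
 * previous (kdump) kernel, and performs the invalidation required for
 * the non-present to present transition.
 */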
2061 static int domain_context_mapping_one(struct dmar_domain *domain,
2062 struct intel_iommu *iommu,
2063 struct pasid_table *table,
2064 u8 bus, u8 devfn)
2065 {
2066 u16 did = domain->iommu_did[iommu->seq_id];
2067 int translation = CONTEXT_TT_MULTI_LEVEL;
2068 struct device_domain_info *info = NULL;
2069 struct context_entry *context;
2070 unsigned long flags;
2071 int ret;
2072
2073 WARN_ON(did == 0);
2074
2075 if (hw_pass_through && domain_type_is_si(domain))
2076 translation = CONTEXT_TT_PASS_THROUGH;
2077
2078 pr_debug("Set context mapping for %02x:%02x.%d\n",
2079 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2080
2081 BUG_ON(!domain->pgd);
2082
2083 spin_lock_irqsave(&device_domain_lock, flags);
2084 spin_lock(&iommu->lock);
2085
2086 ret = -ENOMEM;
2087 context = iommu_context_addr(iommu, bus, devfn, 1);
2088 if (!context)
2089 goto out_unlock;
2090
2091 ret = 0;
2092 if (context_present(context))
2093 goto out_unlock;
2094
2095 /*
2096 * For kdump cases, old valid entries may be cached due to the
2097 * in-flight DMA and copied pgtable, but there is no unmapping
2098 * behaviour for them, thus we need an explicit cache flush for
2099 * the newly-mapped device. For kdump, at this point, the device
2100 * is supposed to finish reset at its driver probe stage, so no
2101 * in-flight DMA will exist, and we don't need to worry anymore
2102 * hereafter.
2103 */
2104 if (context_copied(context)) {
2105 u16 did_old = context_domain_id(context);
2106
2107 if (did_old < cap_ndoms(iommu->cap)) {
2108 iommu->flush.flush_context(iommu, did_old,
2109 (((u16)bus) << 8) | devfn,
2110 DMA_CCMD_MASK_NOBIT,
2111 DMA_CCMD_DEVICE_INVL);
2112 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2113 DMA_TLB_DSI_FLUSH);
2114 }
2115 }
2116
2117 context_clear_entry(context);
2118
2119 if (sm_supported(iommu)) {
2120 unsigned long pds;
2121
2122 WARN_ON(!table);
2123
2124 /* Setup the PASID DIR pointer: */
2125 pds = context_get_sm_pds(table);
2126 context->lo = (u64)virt_to_phys(table->table) |
2127 context_pdts(pds);
2128
2129 /* Setup the RID_PASID field: */
2130 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2131
2132 /*
2133 * Setup the Device-TLB enable bit and Page request
2134 * Enable bit:
2135 */
2136 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2137 if (info && info->ats_supported)
2138 context_set_sm_dte(context);
2139 if (info && info->pri_supported)
2140 context_set_sm_pre(context);
2141 } else {
2142 struct dma_pte *pgd = domain->pgd;
2143 int agaw;
2144
2145 context_set_domain_id(context, did);
2146
2147 if (translation != CONTEXT_TT_PASS_THROUGH) {
2148 /*
2149 * Skip top levels of page tables for an iommu which has
2150 * a smaller agaw than the default. Unnecessary for PT mode.
2151 */
2152 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2153 ret = -ENOMEM;
2154 pgd = phys_to_virt(dma_pte_addr(pgd));
2155 if (!dma_pte_present(pgd))
2156 goto out_unlock;
2157 }
2158
2159 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2160 if (info && info->ats_supported)
2161 translation = CONTEXT_TT_DEV_IOTLB;
2162 else
2163 translation = CONTEXT_TT_MULTI_LEVEL;
2164
2165 context_set_address_root(context, virt_to_phys(pgd));
2166 context_set_address_width(context, agaw);
2167 } else {
2168 /*
2169 * In pass through mode, AW must be programmed to
2170 * indicate the largest AGAW value supported by
2171 * hardware. And ASR is ignored by hardware.
2172 */
2173 context_set_address_width(context, iommu->msagaw);
2174 }
2175
2176 context_set_translation_type(context, translation);
2177 }
2178
2179 context_set_fault_enable(context);
2180 context_set_present(context);
2181 domain_flush_cache(domain, context, sizeof(*context));
2182
2183 /*
2184 * It's a non-present to present mapping. If hardware doesn't cache
2185 * non-present entries we only need to flush the write-buffer. If it
2186 * _does_ cache non-present entries, then it does so in the special
2187 * domain #0, which we have to flush:
2188 */
2189 if (cap_caching_mode(iommu->cap)) {
2190 iommu->flush.flush_context(iommu, 0,
2191 (((u16)bus) << 8) | devfn,
2192 DMA_CCMD_MASK_NOBIT,
2193 DMA_CCMD_DEVICE_INVL);
2194 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2195 } else {
2196 iommu_flush_write_buffer(iommu);
2197 }
2198 iommu_enable_dev_iotlb(info);
2199
2200 ret = 0;
2201
2202 out_unlock:
2203 spin_unlock(&iommu->lock);
2204 spin_unlock_irqrestore(&device_domain_lock, flags);
2205
2206 return ret;
2207 }
2208
2209 struct domain_context_mapping_data {
2210 struct dmar_domain *domain;
2211 struct intel_iommu *iommu;
2212 struct pasid_table *table;
2213 };
2214
2215 static int domain_context_mapping_cb(struct pci_dev *pdev,
2216 u16 alias, void *opaque)
2217 {
2218 struct domain_context_mapping_data *data = opaque;
2219
2220 return domain_context_mapping_one(data->domain, data->iommu,
2221 data->table, PCI_BUS_NUM(alias),
2222 alias & 0xff);
2223 }
2224
2225 static int
2226 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2227 {
2228 struct domain_context_mapping_data data;
2229 struct pasid_table *table;
2230 struct intel_iommu *iommu;
2231 u8 bus, devfn;
2232
2233 iommu = device_to_iommu(dev, &bus, &devfn);
2234 if (!iommu)
2235 return -ENODEV;
2236
2237 table = intel_pasid_get_table(dev);
2238
2239 if (!dev_is_pci(dev))
2240 return domain_context_mapping_one(domain, iommu, table,
2241 bus, devfn);
2242
2243 data.domain = domain;
2244 data.iommu = iommu;
2245 data.table = table;
2246
2247 return pci_for_each_dma_alias(to_pci_dev(dev),
2248 &domain_context_mapping_cb, &data);
2249 }
2250
2251 static int domain_context_mapped_cb(struct pci_dev *pdev,
2252 u16 alias, void *opaque)
2253 {
2254 struct intel_iommu *iommu = opaque;
2255
2256 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2257 }
2258
2259 static int domain_context_mapped(struct device *dev)
2260 {
2261 struct intel_iommu *iommu;
2262 u8 bus, devfn;
2263
2264 iommu = device_to_iommu(dev, &bus, &devfn);
2265 if (!iommu)
2266 return -ENODEV;
2267
2268 if (!dev_is_pci(dev))
2269 return device_context_mapped(iommu, bus, devfn);
2270
2271 return !pci_for_each_dma_alias(to_pci_dev(dev),
2272 domain_context_mapped_cb, iommu);
2273 }
2274
2275 /* Returns a number of VTD pages, but aligned to MM page size */
2276 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2277 size_t size)
2278 {
2279 host_addr &= ~PAGE_MASK;
2280 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2281 }
2282
2283 /* Return largest possible superpage level for a given mapping */
2284 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2285 unsigned long iov_pfn,
2286 unsigned long phy_pfn,
2287 unsigned long pages)
2288 {
2289 int support, level = 1;
2290 unsigned long pfnmerge;
2291
2292 support = domain->iommu_superpage;
2293
2294 /* To use a large page, the virtual *and* physical addresses
2295 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2296 of them will mean we have to use smaller pages. So just
2297 merge them and check both at once. */
2298 pfnmerge = iov_pfn | phy_pfn;
2299
2300 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2301 pages >>= VTD_STRIDE_SHIFT;
2302 if (!pages)
2303 break;
2304 pfnmerge >>= VTD_STRIDE_SHIFT;
2305 level++;
2306 support--;
2307 }
2308 return level;
2309 }
2310
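/*
 * Populate the page tables of @domain for @nr_pages pages starting at
 * @iov_pfn, taking the physical addresses either from @sg or from the
 * contiguous range starting at @phys_pfn.  Superpages are used when
 * the IOVA, the physical address and the remaining length all allow
 * it and the hardware supports that level.
 */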
2311 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2312 struct scatterlist *sg, unsigned long phys_pfn,
2313 unsigned long nr_pages, int prot)
2314 {
2315 struct dma_pte *first_pte = NULL, *pte = NULL;
2316 phys_addr_t uninitialized_var(pteval);
2317 unsigned long sg_res = 0;
2318 unsigned int largepage_lvl = 0;
2319 unsigned long lvl_pages = 0;
2320 u64 attr;
2321
2322 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2323
2324 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2325 return -EINVAL;
2326
2327 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2328 if (domain_use_first_level(domain))
2329 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2330
2331 if (!sg) {
2332 sg_res = nr_pages;
2333 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2334 }
2335
2336 while (nr_pages > 0) {
2337 uint64_t tmp;
2338
2339 if (!sg_res) {
2340 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2341
2342 sg_res = aligned_nrpages(sg->offset, sg->length);
2343 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2344 sg->dma_length = sg->length;
2345 pteval = (sg_phys(sg) - pgoff) | attr;
2346 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2347 }
2348
2349 if (!pte) {
2350 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2351
2352 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2353 if (!pte)
2354 return -ENOMEM;
2355 /* It is a large page */
2356 if (largepage_lvl > 1) {
2357 unsigned long nr_superpages, end_pfn;
2358
2359 pteval |= DMA_PTE_LARGE_PAGE;
2360 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2361
2362 nr_superpages = sg_res / lvl_pages;
2363 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2364
2365 /*
2366 * Ensure that old small page tables are
2367 * removed to make room for superpage(s).
2368 * We're adding new large pages, so make sure
2369 * we don't remove their parent tables.
2370 */
2371 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2372 largepage_lvl + 1);
2373 } else {
2374 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2375 }
2376
2377 }
2378 /* We don't need a lock here; nobody else
2379 * touches this iova range
2380 */
2381 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2382 if (tmp) {
2383 static int dumps = 5;
2384 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2385 iov_pfn, tmp, (unsigned long long)pteval);
2386 if (dumps) {
2387 dumps--;
2388 debug_dma_dump_mappings(NULL);
2389 }
2390 WARN_ON(1);
2391 }
2392
2393 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2394
2395 BUG_ON(nr_pages < lvl_pages);
2396 BUG_ON(sg_res < lvl_pages);
2397
2398 nr_pages -= lvl_pages;
2399 iov_pfn += lvl_pages;
2400 phys_pfn += lvl_pages;
2401 pteval += lvl_pages * VTD_PAGE_SIZE;
2402 sg_res -= lvl_pages;
2403
2404 /* If the next PTE would be the first in a new page, then we
2405 need to flush the cache on the entries we've just written.
2406 And then we'll need to recalculate 'pte', so clear it and
2407 let it get set again in the if (!pte) block above.
2408
2409 If we're done (!nr_pages) we need to flush the cache too.
2410
2411 Also if we've been setting superpages, we may need to
2412 recalculate 'pte' and switch back to smaller pages for the
2413 end of the mapping, if the trailing size is not enough to
2414 use another superpage (i.e. sg_res < lvl_pages). */
2415 pte++;
2416 if (!nr_pages || first_pte_in_page(pte) ||
2417 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2418 domain_flush_cache(domain, first_pte,
2419 (void *)pte - (void *)first_pte);
2420 pte = NULL;
2421 }
2422
2423 if (!sg_res && nr_pages)
2424 sg = sg_next(sg);
2425 }
2426 return 0;
2427 }
2428
2429 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2430 struct scatterlist *sg, unsigned long phys_pfn,
2431 unsigned long nr_pages, int prot)
2432 {
2433 int iommu_id, ret;
2434 struct intel_iommu *iommu;
2435
2436 /* Do the real mapping first */
2437 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2438 if (ret)
2439 return ret;
2440
2441 for_each_domain_iommu(iommu_id, domain) {
2442 iommu = g_iommus[iommu_id];
2443 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2444 }
2445
2446 return 0;
2447 }
2448
2449 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2450 struct scatterlist *sg, unsigned long nr_pages,
2451 int prot)
2452 {
2453 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2454 }
2455
2456 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2457 unsigned long phys_pfn, unsigned long nr_pages,
2458 int prot)
2459 {
2460 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2461 }
2462
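/*
 * Clear the context entry for (@bus, @devfn) and invalidate the
 * context-cache and IOTLB entries tagged with the old domain id.
 */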
2463 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2464 {
2465 unsigned long flags;
2466 struct context_entry *context;
2467 u16 did_old;
2468
2469 if (!iommu)
2470 return;
2471
2472 spin_lock_irqsave(&iommu->lock, flags);
2473 context = iommu_context_addr(iommu, bus, devfn, 0);
2474 if (!context) {
2475 spin_unlock_irqrestore(&iommu->lock, flags);
2476 return;
2477 }
2478 did_old = context_domain_id(context);
2479 context_clear_entry(context);
2480 __iommu_flush_cache(iommu, context, sizeof(*context));
2481 spin_unlock_irqrestore(&iommu->lock, flags);
2482 iommu->flush.flush_context(iommu,
2483 did_old,
2484 (((u16)bus) << 8) | devfn,
2485 DMA_CCMD_MASK_NOBIT,
2486 DMA_CCMD_DEVICE_INVL);
2487 iommu->flush.flush_iotlb(iommu,
2488 did_old,
2489 0,
2490 0,
2491 DMA_TLB_DSI_FLUSH);
2492 }
2493
2494 static inline void unlink_domain_info(struct device_domain_info *info)
2495 {
2496 assert_spin_locked(&device_domain_lock);
2497 list_del(&info->link);
2498 list_del(&info->global);
2499 if (info->dev)
2500 info->dev->archdata.iommu = NULL;
2501 }
2502
2503 static void domain_remove_dev_info(struct dmar_domain *domain)
2504 {
2505 struct device_domain_info *info, *tmp;
2506 unsigned long flags;
2507
2508 spin_lock_irqsave(&device_domain_lock, flags);
2509 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2510 __dmar_remove_one_dev_info(info);
2511 spin_unlock_irqrestore(&device_domain_lock, flags);
2512 }
2513
2514 struct dmar_domain *find_domain(struct device *dev)
2515 {
2516 struct device_domain_info *info;
2517
2518 if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2519 return NULL;
2520
2521 if (dev_is_pci(dev))
2522 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2523
2524 /* No lock here, assumes no domain exit in normal case */
2525 info = dev->archdata.iommu;
2526 if (likely(info))
2527 return info->domain;
2528
2529 return NULL;
2530 }
2531
2532 static void do_deferred_attach(struct device *dev)
2533 {
2534 struct iommu_domain *domain;
2535
2536 dev->archdata.iommu = NULL;
2537 domain = iommu_get_domain_for_dev(dev);
2538 if (domain)
2539 intel_iommu_attach_device(domain, dev);
2540 }
2541
2542 static inline struct device_domain_info *
2543 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2544 {
2545 struct device_domain_info *info;
2546
2547 list_for_each_entry(info, &device_domain_list, global)
2548 if (info->iommu->segment == segment && info->bus == bus &&
2549 info->devfn == devfn)
2550 return info;
2551
2552 return NULL;
2553 }
2554
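/*
 * Set up first-level (scalable mode) translation for @pasid of @dev:
 * skip any top page-table levels the IOMMU cannot walk, verify that
 * the result is 4- or 5-level paging, and program the PASID entry.
 */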
2555 static int domain_setup_first_level(struct intel_iommu *iommu,
2556 struct dmar_domain *domain,
2557 struct device *dev,
2558 int pasid)
2559 {
2560 int flags = PASID_FLAG_SUPERVISOR_MODE;
2561 struct dma_pte *pgd = domain->pgd;
2562 int agaw, level;
2563
2564 /*
2565 * Skip top levels of page tables for an iommu which has
2566 * a smaller agaw than the default. Unnecessary for PT mode.
2567 */
2568 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2569 pgd = phys_to_virt(dma_pte_addr(pgd));
2570 if (!dma_pte_present(pgd))
2571 return -ENOMEM;
2572 }
2573
2574 level = agaw_to_level(agaw);
2575 if (level != 4 && level != 5)
2576 return -EINVAL;
2577
2578 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2579
2580 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2581 domain->iommu_did[iommu->seq_id],
2582 flags);
2583 }
2584
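/*
 * Allocate and register a device_domain_info for (@bus, @devfn)/@dev,
 * probe ATS/PASID/PRI support, attach @domain to @iommu, set up the
 * RID2PASID entry in scalable mode and program the context entry.
 * If the device (or one of its DMA aliases) already has a domain,
 * that existing domain is returned and the caller must free the one
 * it passed in.
 */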
2585 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2586 int bus, int devfn,
2587 struct device *dev,
2588 struct dmar_domain *domain)
2589 {
2590 struct dmar_domain *found = NULL;
2591 struct device_domain_info *info;
2592 unsigned long flags;
2593 int ret;
2594
2595 info = alloc_devinfo_mem();
2596 if (!info)
2597 return NULL;
2598
2599 info->bus = bus;
2600 info->devfn = devfn;
2601 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2602 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2603 info->ats_qdep = 0;
2604 info->dev = dev;
2605 info->domain = domain;
2606 info->iommu = iommu;
2607 info->pasid_table = NULL;
2608 info->auxd_enabled = 0;
2609 INIT_LIST_HEAD(&info->auxiliary_domains);
2610
2611 if (dev && dev_is_pci(dev)) {
2612 struct pci_dev *pdev = to_pci_dev(info->dev);
2613
2614 if (!pdev->untrusted &&
2615 !pci_ats_disabled() &&
2616 ecap_dev_iotlb_support(iommu->ecap) &&
2617 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2618 dmar_find_matched_atsr_unit(pdev))
2619 info->ats_supported = 1;
2620
2621 if (sm_supported(iommu)) {
2622 if (pasid_supported(iommu)) {
2623 int features = pci_pasid_features(pdev);
2624 if (features >= 0)
2625 info->pasid_supported = features | 1;
2626 }
2627
2628 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2629 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2630 info->pri_supported = 1;
2631 }
2632 }
2633
2634 spin_lock_irqsave(&device_domain_lock, flags);
2635 if (dev)
2636 found = find_domain(dev);
2637
2638 if (!found) {
2639 struct device_domain_info *info2;
2640 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2641 if (info2) {
2642 found = info2->domain;
2643 info2->dev = dev;
2644 }
2645 }
2646
2647 if (found) {
2648 spin_unlock_irqrestore(&device_domain_lock, flags);
2649 free_devinfo_mem(info);
2650 /* Caller must free the original domain */
2651 return found;
2652 }
2653
2654 spin_lock(&iommu->lock);
2655 ret = domain_attach_iommu(domain, iommu);
2656 spin_unlock(&iommu->lock);
2657
2658 if (ret) {
2659 spin_unlock_irqrestore(&device_domain_lock, flags);
2660 free_devinfo_mem(info);
2661 return NULL;
2662 }
2663
2664 list_add(&info->link, &domain->devices);
2665 list_add(&info->global, &device_domain_list);
2666 if (dev)
2667 dev->archdata.iommu = info;
2668 spin_unlock_irqrestore(&device_domain_lock, flags);
2669
2670 /* PASID table is mandatory for a PCI device in scalable mode. */
2671 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2672 ret = intel_pasid_alloc_table(dev);
2673 if (ret) {
2674 dev_err(dev, "PASID table allocation failed\n");
2675 dmar_remove_one_dev_info(dev);
2676 return NULL;
2677 }
2678
2679 /* Setup the PASID entry for requests without PASID: */
2680 spin_lock(&iommu->lock);
2681 if (hw_pass_through && domain_type_is_si(domain))
2682 ret = intel_pasid_setup_pass_through(iommu, domain,
2683 dev, PASID_RID2PASID);
2684 else if (domain_use_first_level(domain))
2685 ret = domain_setup_first_level(iommu, domain, dev,
2686 PASID_RID2PASID);
2687 else
2688 ret = intel_pasid_setup_second_level(iommu, domain,
2689 dev, PASID_RID2PASID);
2690 spin_unlock(&iommu->lock);
2691 if (ret) {
2692 dev_err(dev, "Setup RID2PASID failed\n");
2693 dmar_remove_one_dev_info(dev);
2694 return NULL;
2695 }
2696 }
2697
2698 if (dev && domain_context_mapping(domain, dev)) {
2699 dev_err(dev, "Domain context map failed\n");
2700 dmar_remove_one_dev_info(dev);
2701 return NULL;
2702 }
2703
2704 return domain;
2705 }
2706
2707 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2708 {
2709 *(u16 *)opaque = alias;
2710 return 0;
2711 }
2712
2713 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2714 {
2715 struct device_domain_info *info;
2716 struct dmar_domain *domain = NULL;
2717 struct intel_iommu *iommu;
2718 u16 dma_alias;
2719 unsigned long flags;
2720 u8 bus, devfn;
2721
2722 iommu = device_to_iommu(dev, &bus, &devfn);
2723 if (!iommu)
2724 return NULL;
2725
2726 if (dev_is_pci(dev)) {
2727 struct pci_dev *pdev = to_pci_dev(dev);
2728
2729 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2730
2731 spin_lock_irqsave(&device_domain_lock, flags);
2732 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2733 PCI_BUS_NUM(dma_alias),
2734 dma_alias & 0xff);
2735 if (info) {
2736 iommu = info->iommu;
2737 domain = info->domain;
2738 }
2739 spin_unlock_irqrestore(&device_domain_lock, flags);
2740
2741 /* DMA alias already has a domain, use it */
2742 if (info)
2743 goto out;
2744 }
2745
2746 /* Allocate and initialize new domain for the device */
2747 domain = alloc_domain(0);
2748 if (!domain)
2749 return NULL;
2750 if (domain_init(domain, iommu, gaw)) {
2751 domain_exit(domain);
2752 return NULL;
2753 }
2754
2755 out:
2756 return domain;
2757 }
2758
2759 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2760 struct dmar_domain *domain)
2761 {
2762 struct intel_iommu *iommu;
2763 struct dmar_domain *tmp;
2764 u16 req_id, dma_alias;
2765 u8 bus, devfn;
2766
2767 iommu = device_to_iommu(dev, &bus, &devfn);
2768 if (!iommu)
2769 return NULL;
2770
2771 req_id = ((u16)bus << 8) | devfn;
2772
2773 if (dev_is_pci(dev)) {
2774 struct pci_dev *pdev = to_pci_dev(dev);
2775
2776 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2777
2778 /* register PCI DMA alias device */
2779 if (req_id != dma_alias) {
2780 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2781 dma_alias & 0xff, NULL, domain);
2782
2783 if (!tmp || tmp != domain)
2784 return tmp;
2785 }
2786 }
2787
2788 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2789 if (!tmp || tmp != domain)
2790 return tmp;
2791
2792 return domain;
2793 }
2794
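/*
 * Reserve the IOVA range covering [@start, @end] in @domain and
 * install a 1:1 mapping for it.
 */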
2795 static int iommu_domain_identity_map(struct dmar_domain *domain,
2796 unsigned long long start,
2797 unsigned long long end)
2798 {
2799 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2800 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2801
2802 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2803 dma_to_mm_pfn(last_vpfn))) {
2804 pr_err("Reserving iova failed\n");
2805 return -ENOMEM;
2806 }
2807
2808 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2809 /*
2810 * The RMRR range might overlap with a physical memory range,
2811 * so clear it first.
2812 */
2813 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2814
2815 return __domain_mapping(domain, first_vpfn, NULL,
2816 first_vpfn, last_vpfn - first_vpfn + 1,
2817 DMA_PTE_READ|DMA_PTE_WRITE);
2818 }
2819
2820 static int domain_prepare_identity_map(struct device *dev,
2821 struct dmar_domain *domain,
2822 unsigned long long start,
2823 unsigned long long end)
2824 {
2825 /* For _hardware_ passthrough, don't bother. But for software
2826 passthrough, we do it anyway -- it may indicate a memory
2827 range which is reserved in E820 and therefore didn't get
2828 set up in si_domain to start with */
2829 if (domain == si_domain && hw_pass_through) {
2830 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2831 start, end);
2832 return 0;
2833 }
2834
2835 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2836
2837 if (end < start) {
2838 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2839 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2840 dmi_get_system_info(DMI_BIOS_VENDOR),
2841 dmi_get_system_info(DMI_BIOS_VERSION),
2842 dmi_get_system_info(DMI_PRODUCT_VERSION));
2843 return -EIO;
2844 }
2845
2846 if (end >> agaw_to_width(domain->agaw)) {
2847 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2848 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2849 agaw_to_width(domain->agaw),
2850 dmi_get_system_info(DMI_BIOS_VENDOR),
2851 dmi_get_system_info(DMI_BIOS_VERSION),
2852 dmi_get_system_info(DMI_PRODUCT_VERSION));
2853 return -EIO;
2854 }
2855
2856 return iommu_domain_identity_map(domain, start, end);
2857 }
2858
2859 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2860
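/*
 * Create the static identity domain (si_domain) used for passthrough.
 * For software passthrough, identity map every usable memory range of
 * every online node as well as all RMRR regions; with hardware
 * passthrough (@hw) no mappings are needed.
 */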
2861 static int __init si_domain_init(int hw)
2862 {
2863 struct dmar_rmrr_unit *rmrr;
2864 struct device *dev;
2865 int i, nid, ret;
2866
2867 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2868 if (!si_domain)
2869 return -EFAULT;
2870
2871 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2872 domain_exit(si_domain);
2873 return -EFAULT;
2874 }
2875
2876 if (hw)
2877 return 0;
2878
2879 for_each_online_node(nid) {
2880 unsigned long start_pfn, end_pfn;
2881 int i;
2882
2883 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2884 ret = iommu_domain_identity_map(si_domain,
2885 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2886 if (ret)
2887 return ret;
2888 }
2889 }
2890
2891 /*
2892 * Identity map the RMRRs so that devices with RMRRs can also use
2893 * the si_domain.
2894 */
2895 for_each_rmrr_units(rmrr) {
2896 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2897 i, dev) {
2898 unsigned long long start = rmrr->base_address;
2899 unsigned long long end = rmrr->end_address;
2900
2901 if (WARN_ON(end < start ||
2902 end >> agaw_to_width(si_domain->agaw)))
2903 continue;
2904
2905 ret = iommu_domain_identity_map(si_domain, start, end);
2906 if (ret)
2907 return ret;
2908 }
2909 }
2910
2911 return 0;
2912 }
2913
2914 static int identity_mapping(struct device *dev)
2915 {
2916 struct device_domain_info *info;
2917
2918 info = dev->archdata.iommu;
2919 if (info)
2920 return (info->domain == si_domain);
2921
2922 return 0;
2923 }
2924
2925 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2926 {
2927 struct dmar_domain *ndomain;
2928 struct intel_iommu *iommu;
2929 u8 bus, devfn;
2930
2931 iommu = device_to_iommu(dev, &bus, &devfn);
2932 if (!iommu)
2933 return -ENODEV;
2934
2935 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2936 if (ndomain != domain)
2937 return -EBUSY;
2938
2939 return 0;
2940 }
2941
2942 static bool device_has_rmrr(struct device *dev)
2943 {
2944 struct dmar_rmrr_unit *rmrr;
2945 struct device *tmp;
2946 int i;
2947
2948 rcu_read_lock();
2949 for_each_rmrr_units(rmrr) {
2950 /*
2951 * Return TRUE if this RMRR contains the device that
2952 * is passed in.
2953 */
2954 for_each_active_dev_scope(rmrr->devices,
2955 rmrr->devices_cnt, i, tmp)
2956 if (tmp == dev ||
2957 is_downstream_to_pci_bridge(dev, tmp)) {
2958 rcu_read_unlock();
2959 return true;
2960 }
2961 }
2962 rcu_read_unlock();
2963 return false;
2964 }
2965
2966 /**
2967 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2968 * is relaxable (ie. is allowed to be not enforced under some conditions)
2969 * @dev: device handle
2970 *
2971 * We assume that PCI USB devices with RMRRs have them largely
2972 * for historical reasons and that the RMRR space is not actively used post
2973 * boot. This exclusion may change if vendors begin to abuse it.
2974 *
2975 * The same exception is made for graphics devices, with the requirement that
2976 * any use of the RMRR regions will be torn down before assigning the device
2977 * to a guest.
2978 *
2979 * Return: true if the RMRR is relaxable, false otherwise
2980 */
2981 static bool device_rmrr_is_relaxable(struct device *dev)
2982 {
2983 struct pci_dev *pdev;
2984
2985 if (!dev_is_pci(dev))
2986 return false;
2987
2988 pdev = to_pci_dev(dev);
2989 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2990 return true;
2991 else
2992 return false;
2993 }
2994
2995 /*
2996 * There are a couple cases where we need to restrict the functionality of
2997 * devices associated with RMRRs. The first is when evaluating a device for
2998 * identity mapping because problems exist when devices are moved in and out
2999 * of domains and their respective RMRR information is lost. This means that
3000 * a device with associated RMRRs will never be in a "passthrough" domain.
3001 * The second is use of the device through the IOMMU API. This interface
3002 * expects to have full control of the IOVA space for the device. We cannot
3003 * satisfy both the requirement that RMRR access is maintained and have an
3004 * unencumbered IOVA space. We also have no ability to quiesce the device's
3005 * use of the RMRR space or even inform the IOMMU API user of the restriction.
3006 * We therefore prevent devices associated with an RMRR from participating in
3007 * the IOMMU API, which eliminates them from device assignment.
3008 *
3009 * In both cases, devices which have relaxable RMRRs are not concerned by this
3010 * restriction. See device_rmrr_is_relaxable comment.
3011 */
3012 static bool device_is_rmrr_locked(struct device *dev)
3013 {
3014 if (!device_has_rmrr(dev))
3015 return false;
3016
3017 if (device_rmrr_is_relaxable(dev))
3018 return false;
3019
3020 return true;
3021 }
3022
3023 /*
3024 * Return the required default domain type for a specific device.
3025 *
3026 * @dev: the device in query
3027 * @startup: true if this is during early boot
3028 *
3029 * Returns:
3030 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3031 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
3032 * - 0: both identity and dynamic domains work for this device
3033 */
3034 static int device_def_domain_type(struct device *dev)
3035 {
3036 if (dev_is_pci(dev)) {
3037 struct pci_dev *pdev = to_pci_dev(dev);
3038
3039 /*
3040 * Prevent any device marked as untrusted from getting
3041 * placed into the static identity mapping domain.
3042 */
3043 if (pdev->untrusted)
3044 return IOMMU_DOMAIN_DMA;
3045
3046 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3047 return IOMMU_DOMAIN_IDENTITY;
3048
3049 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3050 return IOMMU_DOMAIN_IDENTITY;
3051
3052 /*
3053 * We want to start off with all devices in the 1:1 domain, and
3054 * take them out later if we find they can't access all of memory.
3055 *
3056 * However, we can't do this for PCI devices behind bridges,
3057 * because all PCI devices behind the same bridge will end up
3058 * with the same source-id on their transactions.
3059 *
3060 * Practically speaking, we can't change things around for these
3061 * devices at run-time, because we can't be sure there'll be no
3062 * DMA transactions in flight for any of their siblings.
3063 *
3064 * So PCI devices (unless they're on the root bus) as well as
3065 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3066 * the 1:1 domain, just in _case_ one of their siblings turns out
3067 * not to be able to map all of memory.
3068 */
3069 if (!pci_is_pcie(pdev)) {
3070 if (!pci_is_root_bus(pdev->bus))
3071 return IOMMU_DOMAIN_DMA;
3072 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3073 return IOMMU_DOMAIN_DMA;
3074 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3075 return IOMMU_DOMAIN_DMA;
3076 }
3077
3078 return 0;
3079 }
3080
3081 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3082 {
3083 /*
3084 * Start from the sane iommu hardware state.
3085 * If queued invalidation was already initialized by us
3086 * (for example, while enabling interrupt remapping) then
3087 * things are already rolling from a sane state.
3088 */
3089 if (!iommu->qi) {
3090 /*
3091 * Clear any previous faults.
3092 */
3093 dmar_fault(-1, iommu);
3094 /*
3095 * Disable queued invalidation if supported and already enabled
3096 * before OS handover.
3097 */
3098 dmar_disable_qi(iommu);
3099 }
3100
3101 if (dmar_enable_qi(iommu)) {
3102 /*
3103 * Queued Invalidate not enabled, use Register Based Invalidate
3104 */
3105 iommu->flush.flush_context = __iommu_flush_context;
3106 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3107 pr_info("%s: Using Register based invalidation\n",
3108 iommu->name);
3109 } else {
3110 iommu->flush.flush_context = qi_flush_context;
3111 iommu->flush.flush_iotlb = qi_flush_iotlb;
3112 pr_info("%s: Using Queued invalidation\n", iommu->name);
3113 }
3114 }
3115
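/*
 * Copy one bus worth of context entries from the previous kernel's
 * tables into freshly allocated tables, reserving the domain ids
 * found there and marking every entry as copied so that later updates
 * know to flush the stale cached entries.
 */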
3116 static int copy_context_table(struct intel_iommu *iommu,
3117 struct root_entry *old_re,
3118 struct context_entry **tbl,
3119 int bus, bool ext)
3120 {
3121 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3122 struct context_entry *new_ce = NULL, ce;
3123 struct context_entry *old_ce = NULL;
3124 struct root_entry re;
3125 phys_addr_t old_ce_phys;
3126
3127 tbl_idx = ext ? bus * 2 : bus;
3128 memcpy(&re, old_re, sizeof(re));
3129
3130 for (devfn = 0; devfn < 256; devfn++) {
3131 /* First calculate the correct index */
3132 idx = (ext ? devfn * 2 : devfn) % 256;
3133
3134 if (idx == 0) {
3135 /* First save what we may have and clean up */
3136 if (new_ce) {
3137 tbl[tbl_idx] = new_ce;
3138 __iommu_flush_cache(iommu, new_ce,
3139 VTD_PAGE_SIZE);
3140 pos = 1;
3141 }
3142
3143 if (old_ce)
3144 memunmap(old_ce);
3145
3146 ret = 0;
3147 if (devfn < 0x80)
3148 old_ce_phys = root_entry_lctp(&re);
3149 else
3150 old_ce_phys = root_entry_uctp(&re);
3151
3152 if (!old_ce_phys) {
3153 if (ext && devfn == 0) {
3154 /* No LCTP, try UCTP */
3155 devfn = 0x7f;
3156 continue;
3157 } else {
3158 goto out;
3159 }
3160 }
3161
3162 ret = -ENOMEM;
3163 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3164 MEMREMAP_WB);
3165 if (!old_ce)
3166 goto out;
3167
3168 new_ce = alloc_pgtable_page(iommu->node);
3169 if (!new_ce)
3170 goto out_unmap;
3171
3172 ret = 0;
3173 }
3174
3175 /* Now copy the context entry */
3176 memcpy(&ce, old_ce + idx, sizeof(ce));
3177
3178 if (!__context_present(&ce))
3179 continue;
3180
3181 did = context_domain_id(&ce);
3182 if (did >= 0 && did < cap_ndoms(iommu->cap))
3183 set_bit(did, iommu->domain_ids);
3184
3185 /*
3186 * We need a marker for copied context entries. This
3187 * marker needs to work for the old format as well as
3188 * for extended context entries.
3189 *
3190 * Bit 67 of the context entry is used. In the old
3191 * format this bit is available to software, in the
3192 * extended format it is the PGE bit, but PGE is ignored
3193 * by HW if PASIDs are disabled (and thus still
3194 * available).
3195 *
3196 * So disable PASIDs first and then mark the entry
3197 * copied. This means that we don't copy PASID
3198 * translations from the old kernel, but this is fine as
3199 * faults there are not fatal.
3200 */
3201 context_clear_pasid_enable(&ce);
3202 context_set_copied(&ce);
3203
3204 new_ce[idx] = ce;
3205 }
3206
3207 tbl[tbl_idx + pos] = new_ce;
3208
3209 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3210
3211 out_unmap:
3212 memunmap(old_ce);
3213
3214 out:
3215 return ret;
3216 }
3217
3218 static int copy_translation_tables(struct intel_iommu *iommu)
3219 {
3220 struct context_entry **ctxt_tbls;
3221 struct root_entry *old_rt;
3222 phys_addr_t old_rt_phys;
3223 int ctxt_table_entries;
3224 unsigned long flags;
3225 u64 rtaddr_reg;
3226 int bus, ret;
3227 bool new_ext, ext;
3228
3229 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3230 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3231 new_ext = !!ecap_ecs(iommu->ecap);
3232
3233 /*
3234 * The RTT bit can only be changed when translation is disabled,
3235 * but disabling translation means to open a window for data
3236 * corruption. So bail out and don't copy anything if we would
3237 * have to change the bit.
3238 */
3239 if (new_ext != ext)
3240 return -EINVAL;
3241
3242 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3243 if (!old_rt_phys)
3244 return -EINVAL;
3245
3246 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3247 if (!old_rt)
3248 return -ENOMEM;
3249
3250 /* This is too big for the stack - allocate it from slab */
3251 ctxt_table_entries = ext ? 512 : 256;
3252 ret = -ENOMEM;
3253 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3254 if (!ctxt_tbls)
3255 goto out_unmap;
3256
3257 for (bus = 0; bus < 256; bus++) {
3258 ret = copy_context_table(iommu, &old_rt[bus],
3259 ctxt_tbls, bus, ext);
3260 if (ret) {
3261 pr_err("%s: Failed to copy context table for bus %d\n",
3262 iommu->name, bus);
3263 continue;
3264 }
3265 }
3266
3267 spin_lock_irqsave(&iommu->lock, flags);
3268
3269 /* Context tables are copied, now write them to the root_entry table */
3270 for (bus = 0; bus < 256; bus++) {
3271 int idx = ext ? bus * 2 : bus;
3272 u64 val;
3273
3274 if (ctxt_tbls[idx]) {
3275 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3276 iommu->root_entry[bus].lo = val;
3277 }
3278
3279 if (!ext || !ctxt_tbls[idx + 1])
3280 continue;
3281
3282 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3283 iommu->root_entry[bus].hi = val;
3284 }
3285
3286 spin_unlock_irqrestore(&iommu->lock, flags);
3287
3288 kfree(ctxt_tbls);
3289
3290 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3291
3292 ret = 0;
3293
3294 out_unmap:
3295 memunmap(old_rt);
3296
3297 return ret;
3298 }
3299
3300 static int __init init_dmars(void)
3301 {
3302 struct dmar_drhd_unit *drhd;
3303 struct intel_iommu *iommu;
3304 int ret;
3305
3306 /*
3307 * for each drhd
3308 * allocate root
3309 * initialize and program root entry to not present
3310 * endfor
3311 */
3312 for_each_drhd_unit(drhd) {
3313 /*
3314 * No lock needed as this is only incremented in the single-
3315 * threaded kernel __init code path; all other accesses are
3316 * read-only.
3317 */
3318 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3319 g_num_of_iommus++;
3320 continue;
3321 }
3322 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3323 }
3324
3325 /* Preallocate enough resources for IOMMU hot-addition */
3326 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3327 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3328
3329 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3330 GFP_KERNEL);
3331 if (!g_iommus) {
3332 pr_err("Allocating global iommu array failed\n");
3333 ret = -ENOMEM;
3334 goto error;
3335 }
3336
3337 for_each_iommu(iommu, drhd) {
3338 if (drhd->ignored) {
3339 iommu_disable_translation(iommu);
3340 continue;
3341 }
3342
3343 /*
3344 * Find the max pasid size of all IOMMUs in the system.
3345 * We need to ensure the system pasid table is no bigger
3346 * than the smallest supported size.
3347 */
3348 if (pasid_supported(iommu)) {
3349 u32 temp = 2 << ecap_pss(iommu->ecap);
3350
3351 intel_pasid_max_id = min_t(u32, temp,
3352 intel_pasid_max_id);
3353 }
3354
3355 g_iommus[iommu->seq_id] = iommu;
3356
3357 intel_iommu_init_qi(iommu);
3358
3359 ret = iommu_init_domains(iommu);
3360 if (ret)
3361 goto free_iommu;
3362
3363 init_translation_status(iommu);
3364
3365 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3366 iommu_disable_translation(iommu);
3367 clear_translation_pre_enabled(iommu);
3368 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3369 iommu->name);
3370 }
3371
3372 /*
3373 * TBD:
3374 * we could share the same root & context tables
3375 * among all IOMMUs. Need to split this out later.
3376 */
3377 ret = iommu_alloc_root_entry(iommu);
3378 if (ret)
3379 goto free_iommu;
3380
3381 if (translation_pre_enabled(iommu)) {
3382 pr_info("Translation already enabled - trying to copy translation structures\n");
3383
3384 ret = copy_translation_tables(iommu);
3385 if (ret) {
3386 /*
3387 * We found the IOMMU with translation
3388 * enabled - but failed to copy over the
3389 * old root-entry table. Try to proceed
3390 * by disabling translation now and
3391 * allocating a clean root-entry table.
3392 * This might cause DMAR faults, but
3393 * probably the dump will still succeed.
3394 */
3395 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3396 iommu->name);
3397 iommu_disable_translation(iommu);
3398 clear_translation_pre_enabled(iommu);
3399 } else {
3400 pr_info("Copied translation tables from previous kernel for %s\n",
3401 iommu->name);
3402 }
3403 }
3404
3405 if (!ecap_pass_through(iommu->ecap))
3406 hw_pass_through = 0;
3407 intel_svm_check(iommu);
3408 }
3409
3410 /*
3411 * Now that qi is enabled on all iommus, set the root entry and flush
3412 * caches. This is required on some Intel X58 chipsets, otherwise the
3413 * flush_context function will loop forever and the boot hangs.
3414 */
3415 for_each_active_iommu(iommu, drhd) {
3416 iommu_flush_write_buffer(iommu);
3417 iommu_set_root_entry(iommu);
3418 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3419 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3420 }
3421
3422 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3423 dmar_map_gfx = 0;
3424 #endif
3425
3426 if (!dmar_map_gfx)
3427 iommu_identity_mapping |= IDENTMAP_GFX;
3428
3429 check_tylersburg_isoch();
3430
3431 ret = si_domain_init(hw_pass_through);
3432 if (ret)
3433 goto free_iommu;
3434
3435 /*
3436 * for each drhd
3437 * enable fault log
3438 * global invalidate context cache
3439 * global invalidate iotlb
3440 * enable translation
3441 */
3442 for_each_iommu(iommu, drhd) {
3443 if (drhd->ignored) {
3444 /*
3445 * we always have to disable PMRs or DMA may fail on
3446 * this device
3447 */
3448 if (force_on)
3449 iommu_disable_protect_mem_regions(iommu);
3450 continue;
3451 }
3452
3453 iommu_flush_write_buffer(iommu);
3454
3455 #ifdef CONFIG_INTEL_IOMMU_SVM
3456 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3457 /*
3458 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3459 * could cause a lock race, so drop the lock around it.
3460 */
3461 up_write(&dmar_global_lock);
3462 ret = intel_svm_enable_prq(iommu);
3463 down_write(&dmar_global_lock);
3464 if (ret)
3465 goto free_iommu;
3466 }
3467 #endif
3468 ret = dmar_set_interrupt(iommu);
3469 if (ret)
3470 goto free_iommu;
3471 }
3472
3473 return 0;
3474
3475 free_iommu:
3476 for_each_active_iommu(iommu, drhd) {
3477 disable_dmar_iommu(iommu);
3478 free_dmar_iommu(iommu);
3479 }
3480
3481 kfree(g_iommus);
3482
3483 error:
3484 return ret;
3485 }
3486
3487 /* This takes a number of _MM_ pages, not VTD pages */
3488 static unsigned long intel_alloc_iova(struct device *dev,
3489 struct dmar_domain *domain,
3490 unsigned long nrpages, uint64_t dma_mask)
3491 {
3492 unsigned long iova_pfn;
3493
3494 /*
3495 * Restrict dma_mask to the width that the iommu can handle.
3496 * First-level translation restricts the input-address to a
3497 * canonical address (i.e., address bits 63:N have the same
3498 * value as address bit [N-1], where N is 48-bits with 4-level
3499 * paging and 57-bits with 5-level paging). Hence, skip bit
3500 * [N-1].
3501 */
3502 if (domain_use_first_level(domain))
3503 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3504 dma_mask);
3505 else
3506 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3507 dma_mask);
3508
3509 /* Ensure we reserve the whole size-aligned region */
3510 nrpages = __roundup_pow_of_two(nrpages);
3511
3512 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3513 /*
3514 * First try to allocate an io virtual address in
3515 * DMA_BIT_MASK(32) and if that fails then try allocating
3516 * from higher range
3517 */
3518 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3519 IOVA_PFN(DMA_BIT_MASK(32)), false);
3520 if (iova_pfn)
3521 return iova_pfn;
3522 }
3523 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3524 IOVA_PFN(dma_mask), true);
3525 if (unlikely(!iova_pfn)) {
3526 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3527 nrpages);
3528 return 0;
3529 }
3530
3531 return iova_pfn;
3532 }
3533
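/*
 * Allocate a private DMA-API domain for @dev (which must not already
 * have one), set up identity mappings for any RMRRs that target the
 * device, and make the new domain the device's domain.
 */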
3534 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3535 {
3536 struct dmar_domain *domain, *tmp;
3537 struct dmar_rmrr_unit *rmrr;
3538 struct device *i_dev;
3539 int i, ret;
3540
3541 /* Device shouldn't be attached to any domain yet. */
3542 domain = find_domain(dev);
3543 if (domain)
3544 return NULL;
3545
3546 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3547 if (!domain)
3548 goto out;
3549
3550 /* We have a new domain - setup possible RMRRs for the device */
3551 rcu_read_lock();
3552 for_each_rmrr_units(rmrr) {
3553 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3554 i, i_dev) {
3555 if (i_dev != dev)
3556 continue;
3557
3558 ret = domain_prepare_identity_map(dev, domain,
3559 rmrr->base_address,
3560 rmrr->end_address);
3561 if (ret)
3562 dev_err(dev, "Mapping reserved region failed\n");
3563 }
3564 }
3565 rcu_read_unlock();
3566
3567 tmp = set_domain_for_dev(dev, domain);
3568 if (!tmp || domain != tmp) {
3569 domain_exit(domain);
3570 domain = tmp;
3571 }
3572
3573 out:
3574 if (!domain)
3575 dev_err(dev, "Allocating domain failed\n");
3576 else
3577 domain->domain.type = IOMMU_DOMAIN_DMA;
3578
3579 return domain;
3580 }
3581
3582 /* Check if the dev needs to go through the non-identity map and unmap process. */
3583 static bool iommu_need_mapping(struct device *dev)
3584 {
3585 int ret;
3586
3587 if (iommu_dummy(dev))
3588 return false;
3589
3590 if (unlikely(attach_deferred(dev)))
3591 do_deferred_attach(dev);
3592
3593 ret = identity_mapping(dev);
3594 if (ret) {
3595 u64 dma_mask = *dev->dma_mask;
3596
3597 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3598 dma_mask = dev->coherent_dma_mask;
3599
3600 if (dma_mask >= dma_direct_get_required_mask(dev))
3601 return false;
3602
3603 /*
3604 * The 32-bit DMA device is removed from si_domain and falls
3605 * back to non-identity mapping.
3606 */
3607 dmar_remove_one_dev_info(dev);
3608 ret = iommu_request_dma_domain_for_dev(dev);
3609 if (ret) {
3610 struct iommu_domain *domain;
3611 struct dmar_domain *dmar_domain;
3612
3613 domain = iommu_get_domain_for_dev(dev);
3614 if (domain) {
3615 dmar_domain = to_dmar_domain(domain);
3616 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3617 }
3618 dmar_remove_one_dev_info(dev);
3619 get_private_domain_for_dev(dev);
3620 }
3621
3622 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3623 }
3624
3625 return true;
3626 }
3627
3628 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3629 size_t size, int dir, u64 dma_mask)
3630 {
3631 struct dmar_domain *domain;
3632 phys_addr_t start_paddr;
3633 unsigned long iova_pfn;
3634 int prot = 0;
3635 int ret;
3636 struct intel_iommu *iommu;
3637 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3638
3639 BUG_ON(dir == DMA_NONE);
3640
3641 domain = find_domain(dev);
3642 if (!domain)
3643 return DMA_MAPPING_ERROR;
3644
3645 iommu = domain_get_iommu(domain);
3646 size = aligned_nrpages(paddr, size);
3647
3648 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3649 if (!iova_pfn)
3650 goto error;
3651
3652 /*
3653 * Check if DMAR supports zero-length reads on write-only
3654 * mappings.
3655 */
3656 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3657 !cap_zlr(iommu->cap))
3658 prot |= DMA_PTE_READ;
3659 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3660 prot |= DMA_PTE_WRITE;
3661 /*
3662 * paddr..(paddr + size) might span a partial page, so we should map
3663 * the whole page. Note: if two parts of one page are separately
3664 * mapped, we might have two guest addresses mapping to the same host
3665 * paddr, but this is not a big problem
3666 */
3667 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3668 mm_to_dma_pfn(paddr_pfn), size, prot);
3669 if (ret)
3670 goto error;
3671
3672 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3673 start_paddr += paddr & ~PAGE_MASK;
3674
3675 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3676
3677 return start_paddr;
3678
3679 error:
3680 if (iova_pfn)
3681 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3682 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3683 size, (unsigned long long)paddr, dir);
3684 return DMA_MAPPING_ERROR;
3685 }
3686
3687 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3688 unsigned long offset, size_t size,
3689 enum dma_data_direction dir,
3690 unsigned long attrs)
3691 {
3692 if (iommu_need_mapping(dev))
3693 return __intel_map_single(dev, page_to_phys(page) + offset,
3694 size, dir, *dev->dma_mask);
3695 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3696 }
3697
3698 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3699 size_t size, enum dma_data_direction dir,
3700 unsigned long attrs)
3701 {
3702 if (iommu_need_mapping(dev))
3703 return __intel_map_single(dev, phys_addr, size, dir,
3704 *dev->dma_mask);
3705 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3706 }
3707
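/*
 * Common DMA-API unmap path: tear down the page tables for the IOVA
 * range and either flush the IOTLB synchronously and free the IOVA
 * right away (strict mode, untrusted devices, or no flush queue), or
 * defer both through the IOVA flush queue.
 */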
3708 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3709 {
3710 struct dmar_domain *domain;
3711 unsigned long start_pfn, last_pfn;
3712 unsigned long nrpages;
3713 unsigned long iova_pfn;
3714 struct intel_iommu *iommu;
3715 struct page *freelist;
3716 struct pci_dev *pdev = NULL;
3717
3718 domain = find_domain(dev);
3719 BUG_ON(!domain);
3720
3721 iommu = domain_get_iommu(domain);
3722
3723 iova_pfn = IOVA_PFN(dev_addr);
3724
3725 nrpages = aligned_nrpages(dev_addr, size);
3726 start_pfn = mm_to_dma_pfn(iova_pfn);
3727 last_pfn = start_pfn + nrpages - 1;
3728
3729 if (dev_is_pci(dev))
3730 pdev = to_pci_dev(dev);
3731
3732 freelist = domain_unmap(domain, start_pfn, last_pfn);
3733 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3734 !has_iova_flush_queue(&domain->iovad)) {
3735 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3736 nrpages, !freelist, 0);
3737 /* free iova */
3738 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3739 dma_free_pagelist(freelist);
3740 } else {
3741 queue_iova(&domain->iovad, iova_pfn, nrpages,
3742 (unsigned long)freelist);
3743 /*
3744 * Queue up the release of the unmap to save the roughly 1/6th of
3745 * the CPU time otherwise used up by the iotlb flush operation.
3746 */
3747 }
3748
3749 trace_unmap_single(dev, dev_addr, size);
3750 }
3751
3752 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3753 size_t size, enum dma_data_direction dir,
3754 unsigned long attrs)
3755 {
3756 if (iommu_need_mapping(dev))
3757 intel_unmap(dev, dev_addr, size);
3758 else
3759 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3760 }
3761
3762 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3763 size_t size, enum dma_data_direction dir, unsigned long attrs)
3764 {
3765 if (iommu_need_mapping(dev))
3766 intel_unmap(dev, dev_addr, size);
3767 }
3768
3769 static void *intel_alloc_coherent(struct device *dev, size_t size,
3770 dma_addr_t *dma_handle, gfp_t flags,
3771 unsigned long attrs)
3772 {
3773 struct page *page = NULL;
3774 int order;
3775
3776 if (!iommu_need_mapping(dev))
3777 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3778
3779 size = PAGE_ALIGN(size);
3780 order = get_order(size);
3781
3782 if (gfpflags_allow_blocking(flags)) {
3783 unsigned int count = size >> PAGE_SHIFT;
3784
3785 page = dma_alloc_from_contiguous(dev, count, order,
3786 flags & __GFP_NOWARN);
3787 }
3788
3789 if (!page)
3790 page = alloc_pages(flags, order);
3791 if (!page)
3792 return NULL;
3793 memset(page_address(page), 0, size);
3794
3795 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3796 DMA_BIDIRECTIONAL,
3797 dev->coherent_dma_mask);
3798 if (*dma_handle != DMA_MAPPING_ERROR)
3799 return page_address(page);
3800 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3801 __free_pages(page, order);
3802
3803 return NULL;
3804 }
3805
3806 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3807 dma_addr_t dma_handle, unsigned long attrs)
3808 {
3809 int order;
3810 struct page *page = virt_to_page(vaddr);
3811
3812 if (!iommu_need_mapping(dev))
3813 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3814
3815 size = PAGE_ALIGN(size);
3816 order = get_order(size);
3817
3818 intel_unmap(dev, dma_handle, size);
3819 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3820 __free_pages(page, order);
3821 }
3822
3823 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3824 int nelems, enum dma_data_direction dir,
3825 unsigned long attrs)
3826 {
3827 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3828 unsigned long nrpages = 0;
3829 struct scatterlist *sg;
3830 int i;
3831
3832 if (!iommu_need_mapping(dev))
3833 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3834
3835 for_each_sg(sglist, sg, nelems, i) {
3836 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3837 }
3838
3839 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3840
3841 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3842 }
3843
3844 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3845 enum dma_data_direction dir, unsigned long attrs)
3846 {
3847 int i;
3848 struct dmar_domain *domain;
3849 size_t size = 0;
3850 int prot = 0;
3851 unsigned long iova_pfn;
3852 int ret;
3853 struct scatterlist *sg;
3854 unsigned long start_vpfn;
3855 struct intel_iommu *iommu;
3856
3857 BUG_ON(dir == DMA_NONE);
3858 if (!iommu_need_mapping(dev))
3859 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3860
3861 domain = find_domain(dev);
3862 if (!domain)
3863 return 0;
3864
3865 iommu = domain_get_iommu(domain);
3866
3867 for_each_sg(sglist, sg, nelems, i)
3868 size += aligned_nrpages(sg->offset, sg->length);
3869
3870 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3871 *dev->dma_mask);
3872 if (!iova_pfn) {
3873 sglist->dma_length = 0;
3874 return 0;
3875 }
3876
3877 /*
3878 * Check if DMAR supports zero-length reads on write-only
3879 * mappings.
3880 */
3881 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3882 !cap_zlr(iommu->cap))
3883 prot |= DMA_PTE_READ;
3884 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3885 prot |= DMA_PTE_WRITE;
3886
3887 start_vpfn = mm_to_dma_pfn(iova_pfn);
3888
3889 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3890 if (unlikely(ret)) {
3891 dma_pte_free_pagetable(domain, start_vpfn,
3892 start_vpfn + size - 1,
3893 agaw_to_level(domain->agaw) + 1);
3894 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3895 return 0;
3896 }
3897
3898 for_each_sg(sglist, sg, nelems, i)
3899 trace_map_sg(dev, i + 1, nelems, sg);
3900
3901 return nelems;
3902 }
3903
3904 static u64 intel_get_required_mask(struct device *dev)
3905 {
3906 if (!iommu_need_mapping(dev))
3907 return dma_direct_get_required_mask(dev);
3908 return DMA_BIT_MASK(32);
3909 }
3910
3911 static const struct dma_map_ops intel_dma_ops = {
3912 .alloc = intel_alloc_coherent,
3913 .free = intel_free_coherent,
3914 .map_sg = intel_map_sg,
3915 .unmap_sg = intel_unmap_sg,
3916 .map_page = intel_map_page,
3917 .unmap_page = intel_unmap_page,
3918 .map_resource = intel_map_resource,
3919 .unmap_resource = intel_unmap_resource,
3920 .dma_supported = dma_direct_supported,
3921 .mmap = dma_common_mmap,
3922 .get_sgtable = dma_common_get_sgtable,
3923 .get_required_mask = intel_get_required_mask,
3924 };
3925
3926 static void
3927 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3928 enum dma_data_direction dir, enum dma_sync_target target)
3929 {
3930 struct dmar_domain *domain;
3931 phys_addr_t tlb_addr;
3932
3933 domain = find_domain(dev);
3934 if (WARN_ON(!domain))
3935 return;
3936
3937 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3938 if (is_swiotlb_buffer(tlb_addr))
3939 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3940 }
3941
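/*
 * Bounce-buffer mapping for untrusted devices: if either the start or the
 * size of the buffer is not VTD_PAGE aligned, the data is staged through a
 * swiotlb slot so the device never gets an IOMMU mapping that exposes
 * unrelated data sharing the same page; fully aligned buffers are mapped
 * in place without copying.
 */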
3942 static dma_addr_t
3943 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3944 enum dma_data_direction dir, unsigned long attrs,
3945 u64 dma_mask)
3946 {
3947 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3948 struct dmar_domain *domain;
3949 struct intel_iommu *iommu;
3950 unsigned long iova_pfn;
3951 unsigned long nrpages;
3952 phys_addr_t tlb_addr;
3953 int prot = 0;
3954 int ret;
3955
3956 if (unlikely(attach_deferred(dev)))
3957 do_deferred_attach(dev);
3958
3959 domain = find_domain(dev);
3960
3961 if (WARN_ON(dir == DMA_NONE || !domain))
3962 return DMA_MAPPING_ERROR;
3963
3964 iommu = domain_get_iommu(domain);
3965 if (WARN_ON(!iommu))
3966 return DMA_MAPPING_ERROR;
3967
3968 nrpages = aligned_nrpages(0, size);
3969 iova_pfn = intel_alloc_iova(dev, domain,
3970 dma_to_mm_pfn(nrpages), dma_mask);
3971 if (!iova_pfn)
3972 return DMA_MAPPING_ERROR;
3973
3974 /*
3975 * Check if DMAR supports zero-length reads on write only
3976 * mappings.
3977 */
3978 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3979 !cap_zlr(iommu->cap))
3980 prot |= DMA_PTE_READ;
3981 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3982 prot |= DMA_PTE_WRITE;
3983
3984 /*
3985 * If both the physical buffer start address and size are
3986 * page aligned, we don't need to use a bounce page.
3987 */
3988 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3989 tlb_addr = swiotlb_tbl_map_single(dev,
3990 __phys_to_dma(dev, io_tlb_start),
3991 paddr, size, aligned_size, dir, attrs);
3992 if (tlb_addr == DMA_MAPPING_ERROR) {
3993 goto swiotlb_error;
3994 } else {
3995 /* Cleanup the padding area. */
3996 void *padding_start = phys_to_virt(tlb_addr);
3997 size_t padding_size = aligned_size;
3998
3999 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
4000 (dir == DMA_TO_DEVICE ||
4001 dir == DMA_BIDIRECTIONAL)) {
4002 padding_start += size;
4003 padding_size -= size;
4004 }
4005
4006 memset(padding_start, 0, padding_size);
4007 }
4008 } else {
4009 tlb_addr = paddr;
4010 }
4011
4012 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
4013 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
4014 if (ret)
4015 goto mapping_error;
4016
4017 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
4018
4019 return (dma_addr_t)iova_pfn << PAGE_SHIFT;
4020
4021 mapping_error:
4022 if (is_swiotlb_buffer(tlb_addr))
4023 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4024 aligned_size, dir, attrs);
4025 swiotlb_error:
4026 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
4027 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
4028 size, (unsigned long long)paddr, dir);
4029
4030 return DMA_MAPPING_ERROR;
4031 }
4032
4033 static void
4034 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
4035 enum dma_data_direction dir, unsigned long attrs)
4036 {
4037 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
4038 struct dmar_domain *domain;
4039 phys_addr_t tlb_addr;
4040
4041 domain = find_domain(dev);
4042 if (WARN_ON(!domain))
4043 return;
4044
4045 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
4046 if (WARN_ON(!tlb_addr))
4047 return;
4048
4049 intel_unmap(dev, dev_addr, size);
4050 if (is_swiotlb_buffer(tlb_addr))
4051 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4052 aligned_size, dir, attrs);
4053
4054 trace_bounce_unmap_single(dev, dev_addr, size);
4055 }
4056
4057 static dma_addr_t
4058 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
4059 size_t size, enum dma_data_direction dir, unsigned long attrs)
4060 {
4061 return bounce_map_single(dev, page_to_phys(page) + offset,
4062 size, dir, attrs, *dev->dma_mask);
4063 }
4064
4065 static dma_addr_t
4066 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
4067 enum dma_data_direction dir, unsigned long attrs)
4068 {
4069 return bounce_map_single(dev, phys_addr, size,
4070 dir, attrs, *dev->dma_mask);
4071 }
4072
4073 static void
4074 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4075 enum dma_data_direction dir, unsigned long attrs)
4076 {
4077 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4078 }
4079
4080 static void
4081 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4082 enum dma_data_direction dir, unsigned long attrs)
4083 {
4084 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4085 }
4086
4087 static void
4088 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4089 enum dma_data_direction dir, unsigned long attrs)
4090 {
4091 struct scatterlist *sg;
4092 int i;
4093
4094 for_each_sg(sglist, sg, nelems, i)
4095 bounce_unmap_page(dev, sg->dma_address,
4096 sg_dma_len(sg), dir, attrs);
4097 }
4098
4099 static int
4100 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4101 enum dma_data_direction dir, unsigned long attrs)
4102 {
4103 int i;
4104 struct scatterlist *sg;
4105
4106 for_each_sg(sglist, sg, nelems, i) {
4107 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4108 sg->offset, sg->length,
4109 dir, attrs);
4110 if (sg->dma_address == DMA_MAPPING_ERROR)
4111 goto out_unmap;
4112 sg_dma_len(sg) = sg->length;
4113 }
4114
4115 for_each_sg(sglist, sg, nelems, i)
4116 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4117
4118 return nelems;
4119
4120 out_unmap:
4121 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4122 return 0;
4123 }
4124
4125 static void
4126 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4127 size_t size, enum dma_data_direction dir)
4128 {
4129 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4130 }
4131
4132 static void
4133 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4134 size_t size, enum dma_data_direction dir)
4135 {
4136 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4137 }
4138
4139 static void
4140 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4141 int nelems, enum dma_data_direction dir)
4142 {
4143 struct scatterlist *sg;
4144 int i;
4145
4146 for_each_sg(sglist, sg, nelems, i)
4147 bounce_sync_single(dev, sg_dma_address(sg),
4148 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4149 }
4150
4151 static void
4152 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4153 int nelems, enum dma_data_direction dir)
4154 {
4155 struct scatterlist *sg;
4156 int i;
4157
4158 for_each_sg(sglist, sg, nelems, i)
4159 bounce_sync_single(dev, sg_dma_address(sg),
4160 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4161 }
4162
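/*
 * Installed per device via set_dma_ops() in intel_iommu_add_device() when
 * device_needs_bounce() flags the device, so only untrusted devices pay
 * the bounce-buffer cost.
 */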
4163 static const struct dma_map_ops bounce_dma_ops = {
4164 .alloc = intel_alloc_coherent,
4165 .free = intel_free_coherent,
4166 .map_sg = bounce_map_sg,
4167 .unmap_sg = bounce_unmap_sg,
4168 .map_page = bounce_map_page,
4169 .unmap_page = bounce_unmap_page,
4170 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4171 .sync_single_for_device = bounce_sync_single_for_device,
4172 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4173 .sync_sg_for_device = bounce_sync_sg_for_device,
4174 .map_resource = bounce_map_resource,
4175 .unmap_resource = bounce_unmap_resource,
4176 .dma_supported = dma_direct_supported,
4177 };
4178
4179 static inline int iommu_domain_cache_init(void)
4180 {
4181 int ret = 0;
4182
4183 iommu_domain_cache = kmem_cache_create("iommu_domain",
4184 sizeof(struct dmar_domain),
4185 0,
4186 SLAB_HWCACHE_ALIGN,
4188 NULL);
4189 if (!iommu_domain_cache) {
4190 pr_err("Couldn't create iommu_domain cache\n");
4191 ret = -ENOMEM;
4192 }
4193
4194 return ret;
4195 }
4196
4197 static inline int iommu_devinfo_cache_init(void)
4198 {
4199 int ret = 0;
4200
4201 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4202 sizeof(struct device_domain_info),
4203 0,
4204 SLAB_HWCACHE_ALIGN,
4205 NULL);
4206 if (!iommu_devinfo_cache) {
4207 pr_err("Couldn't create devinfo cache\n");
4208 ret = -ENOMEM;
4209 }
4210
4211 return ret;
4212 }
4213
4214 static int __init iommu_init_mempool(void)
4215 {
4216 int ret;
4217 ret = iova_cache_get();
4218 if (ret)
4219 return ret;
4220
4221 ret = iommu_domain_cache_init();
4222 if (ret)
4223 goto domain_error;
4224
4225 ret = iommu_devinfo_cache_init();
4226 if (!ret)
4227 return ret;
4228
4229 kmem_cache_destroy(iommu_domain_cache);
4230 domain_error:
4231 iova_cache_put();
4232
4233 return -ENOMEM;
4234 }
4235
4236 static void __init iommu_exit_mempool(void)
4237 {
4238 kmem_cache_destroy(iommu_devinfo_cache);
4239 kmem_cache_destroy(iommu_domain_cache);
4240 iova_cache_put();
4241 }
4242
4243 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4244 {
4245 struct dmar_drhd_unit *drhd;
4246 u32 vtbar;
4247 int rc;
4248
4249 /* We know that this device on this chipset has its own IOMMU.
4250 * If we find it under a different IOMMU, then the BIOS is lying
4251 * to us. Hope that the IOMMU for this device is actually
4252 * disabled, and it needs no translation...
4253 */
4254 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4255 if (rc) {
4256 /* "can't" happen */
4257 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4258 return;
4259 }
4260 vtbar &= 0xffff0000;
4261
4262 /* we know that this iommu should be at offset 0xa000 from vtbar */
4263 drhd = dmar_find_matched_drhd_unit(pdev);
4264 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4265 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4266 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4267 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4268 }
4269 }
4270 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4271
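/*
 * Mark DMAR units that can be bypassed: units with an empty device scope
 * are ignored outright, and units whose scope contains only graphics
 * devices are ignored when dmar_map_gfx is clear, with their devices
 * stubbed out via DUMMY_DEVICE_DOMAIN_INFO.
 */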
4272 static void __init init_no_remapping_devices(void)
4273 {
4274 struct dmar_drhd_unit *drhd;
4275 struct device *dev;
4276 int i;
4277
4278 for_each_drhd_unit(drhd) {
4279 if (!drhd->include_all) {
4280 for_each_active_dev_scope(drhd->devices,
4281 drhd->devices_cnt, i, dev)
4282 break;
4283 /* ignore DMAR unit if no devices exist */
4284 if (i == drhd->devices_cnt)
4285 drhd->ignored = 1;
4286 }
4287 }
4288
4289 for_each_active_drhd_unit(drhd) {
4290 if (drhd->include_all)
4291 continue;
4292
4293 for_each_active_dev_scope(drhd->devices,
4294 drhd->devices_cnt, i, dev)
4295 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4296 break;
4297 if (i < drhd->devices_cnt)
4298 continue;
4299
4300 /* This IOMMU has *only* gfx devices. Either bypass it or
4301 set the gfx_mapped flag, as appropriate */
4302 if (!dmar_map_gfx) {
4303 drhd->ignored = 1;
4304 for_each_active_dev_scope(drhd->devices,
4305 drhd->devices_cnt, i, dev)
4306 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4307 }
4308 }
4309 }
4310
4311 #ifdef CONFIG_SUSPEND
4312 static int init_iommu_hw(void)
4313 {
4314 struct dmar_drhd_unit *drhd;
4315 struct intel_iommu *iommu = NULL;
4316
4317 for_each_active_iommu(iommu, drhd)
4318 if (iommu->qi)
4319 dmar_reenable_qi(iommu);
4320
4321 for_each_iommu(iommu, drhd) {
4322 if (drhd->ignored) {
4323 /*
4324 * we always have to disable PMRs or DMA may fail on
4325 * this device
4326 */
4327 if (force_on)
4328 iommu_disable_protect_mem_regions(iommu);
4329 continue;
4330 }
4331
4332 iommu_flush_write_buffer(iommu);
4333
4334 iommu_set_root_entry(iommu);
4335
4336 iommu->flush.flush_context(iommu, 0, 0, 0,
4337 DMA_CCMD_GLOBAL_INVL);
4338 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4339 iommu_enable_translation(iommu);
4340 iommu_disable_protect_mem_regions(iommu);
4341 }
4342
4343 return 0;
4344 }
4345
4346 static void iommu_flush_all(void)
4347 {
4348 struct dmar_drhd_unit *drhd;
4349 struct intel_iommu *iommu;
4350
4351 for_each_active_iommu(iommu, drhd) {
4352 iommu->flush.flush_context(iommu, 0, 0, 0,
4353 DMA_CCMD_GLOBAL_INVL);
4354 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4355 DMA_TLB_GLOBAL_FLUSH);
4356 }
4357 }
4358
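/*
 * Suspend: flush all caches, disable translation and save the fault-event
 * registers (FECTL/FEDATA/FEADDR/FEUADDR) of every active IOMMU;
 * iommu_resume() below re-initializes the hardware and restores them.
 */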
4359 static int iommu_suspend(void)
4360 {
4361 struct dmar_drhd_unit *drhd;
4362 struct intel_iommu *iommu = NULL;
4363 unsigned long flag;
4364
4365 for_each_active_iommu(iommu, drhd) {
4366 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4367 GFP_ATOMIC);
4368 if (!iommu->iommu_state)
4369 goto nomem;
4370 }
4371
4372 iommu_flush_all();
4373
4374 for_each_active_iommu(iommu, drhd) {
4375 iommu_disable_translation(iommu);
4376
4377 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4378
4379 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4380 readl(iommu->reg + DMAR_FECTL_REG);
4381 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4382 readl(iommu->reg + DMAR_FEDATA_REG);
4383 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4384 readl(iommu->reg + DMAR_FEADDR_REG);
4385 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4386 readl(iommu->reg + DMAR_FEUADDR_REG);
4387
4388 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4389 }
4390 return 0;
4391
4392 nomem:
4393 for_each_active_iommu(iommu, drhd)
4394 kfree(iommu->iommu_state);
4395
4396 return -ENOMEM;
4397 }
4398
4399 static void iommu_resume(void)
4400 {
4401 struct dmar_drhd_unit *drhd;
4402 struct intel_iommu *iommu = NULL;
4403 unsigned long flag;
4404
4405 if (init_iommu_hw()) {
4406 if (force_on)
4407 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4408 else
4409 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4410 return;
4411 }
4412
4413 for_each_active_iommu(iommu, drhd) {
4414
4415 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4416
4417 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4418 iommu->reg + DMAR_FECTL_REG);
4419 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4420 iommu->reg + DMAR_FEDATA_REG);
4421 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4422 iommu->reg + DMAR_FEADDR_REG);
4423 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4424 iommu->reg + DMAR_FEUADDR_REG);
4425
4426 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4427 }
4428
4429 for_each_active_iommu(iommu, drhd)
4430 kfree(iommu->iommu_state);
4431 }
4432
4433 static struct syscore_ops iommu_syscore_ops = {
4434 .resume = iommu_resume,
4435 .suspend = iommu_suspend,
4436 };
4437
4438 static void __init init_iommu_pm_ops(void)
4439 {
4440 register_syscore_ops(&iommu_syscore_ops);
4441 }
4442
4443 #else
4444 static inline void init_iommu_pm_ops(void) {}
4445 #endif /* CONFIG_SUSPEND */
4446
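/*
 * RMRR regions reported by firmware must be page aligned and non-empty.
 * A malformed entry is flagged as a firmware bug and taints the kernel,
 * but is still recorded so the affected devices keep working.
 */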
4447 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4448 {
4449 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4450 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4451 rmrr->end_address <= rmrr->base_address ||
4452 arch_rmrr_sanity_check(rmrr))
4453 return -EINVAL;
4454
4455 return 0;
4456 }
4457
4458 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4459 {
4460 struct acpi_dmar_reserved_memory *rmrr;
4461 struct dmar_rmrr_unit *rmrru;
4462
4463 rmrr = (struct acpi_dmar_reserved_memory *)header;
4464 if (rmrr_sanity_check(rmrr)) {
4465 pr_warn(FW_BUG
4466 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4467 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4468 rmrr->base_address, rmrr->end_address,
4469 dmi_get_system_info(DMI_BIOS_VENDOR),
4470 dmi_get_system_info(DMI_BIOS_VERSION),
4471 dmi_get_system_info(DMI_PRODUCT_VERSION));
4472 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4473 }
4474
4475 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4476 if (!rmrru)
4477 goto out;
4478
4479 rmrru->hdr = header;
4480
4481 rmrru->base_address = rmrr->base_address;
4482 rmrru->end_address = rmrr->end_address;
4483
4484 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4485 ((void *)rmrr) + rmrr->header.length,
4486 &rmrru->devices_cnt);
4487 if (rmrru->devices_cnt && rmrru->devices == NULL)
4488 goto free_rmrru;
4489
4490 list_add(&rmrru->list, &dmar_rmrr_units);
4491
4492 return 0;
4493 free_rmrru:
4494 kfree(rmrru);
4495 out:
4496 return -ENOMEM;
4497 }
4498
4499 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4500 {
4501 struct dmar_atsr_unit *atsru;
4502 struct acpi_dmar_atsr *tmp;
4503
4504 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4505 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4506 if (atsr->segment != tmp->segment)
4507 continue;
4508 if (atsr->header.length != tmp->header.length)
4509 continue;
4510 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4511 return atsru;
4512 }
4513
4514 return NULL;
4515 }
4516
4517 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4518 {
4519 struct acpi_dmar_atsr *atsr;
4520 struct dmar_atsr_unit *atsru;
4521
4522 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4523 return 0;
4524
4525 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4526 atsru = dmar_find_atsr(atsr);
4527 if (atsru)
4528 return 0;
4529
4530 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4531 if (!atsru)
4532 return -ENOMEM;
4533
4534 /*
4535 * If the memory was allocated from slab by the ACPI _DSM method, we
4536 * need to copy its contents because the buffer will be freed on
4537 * return.
4538 */
4539 atsru->hdr = (void *)(atsru + 1);
4540 memcpy(atsru->hdr, hdr, hdr->length);
4541 atsru->include_all = atsr->flags & 0x1;
4542 if (!atsru->include_all) {
4543 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4544 (void *)atsr + atsr->header.length,
4545 &atsru->devices_cnt);
4546 if (atsru->devices_cnt && atsru->devices == NULL) {
4547 kfree(atsru);
4548 return -ENOMEM;
4549 }
4550 }
4551
4552 list_add_rcu(&atsru->list, &dmar_atsr_units);
4553
4554 return 0;
4555 }
4556
4557 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4558 {
4559 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4560 kfree(atsru);
4561 }
4562
4563 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4564 {
4565 struct acpi_dmar_atsr *atsr;
4566 struct dmar_atsr_unit *atsru;
4567
4568 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4569 atsru = dmar_find_atsr(atsr);
4570 if (atsru) {
4571 list_del_rcu(&atsru->list);
4572 synchronize_rcu();
4573 intel_iommu_free_atsr(atsru);
4574 }
4575
4576 return 0;
4577 }
4578
4579 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4580 {
4581 int i;
4582 struct device *dev;
4583 struct acpi_dmar_atsr *atsr;
4584 struct dmar_atsr_unit *atsru;
4585
4586 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4587 atsru = dmar_find_atsr(atsr);
4588 if (!atsru)
4589 return 0;
4590
4591 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4592 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4593 i, dev)
4594 return -EBUSY;
4595 }
4596
4597 return 0;
4598 }
4599
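/*
 * Bring a hot-added DMAR unit online. The new IOMMU must support the
 * features already committed globally (pass-through, snooping, super
 * pages) or it is rejected with -ENXIO; otherwise it gets domains, a root
 * entry, queued invalidation, an interrupt, and translation is enabled.
 */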
4600 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4601 {
4602 int sp, ret;
4603 struct intel_iommu *iommu = dmaru->iommu;
4604
4605 if (g_iommus[iommu->seq_id])
4606 return 0;
4607
4608 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4609 pr_warn("%s: Doesn't support hardware pass through.\n",
4610 iommu->name);
4611 return -ENXIO;
4612 }
4613 if (!ecap_sc_support(iommu->ecap) &&
4614 domain_update_iommu_snooping(iommu)) {
4615 pr_warn("%s: Doesn't support snooping.\n",
4616 iommu->name);
4617 return -ENXIO;
4618 }
4619 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4620 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4621 pr_warn("%s: Doesn't support large page.\n",
4622 iommu->name);
4623 return -ENXIO;
4624 }
4625
4626 /*
4627 * Disable translation if already enabled prior to OS handover.
4628 */
4629 if (iommu->gcmd & DMA_GCMD_TE)
4630 iommu_disable_translation(iommu);
4631
4632 g_iommus[iommu->seq_id] = iommu;
4633 ret = iommu_init_domains(iommu);
4634 if (ret == 0)
4635 ret = iommu_alloc_root_entry(iommu);
4636 if (ret)
4637 goto out;
4638
4639 intel_svm_check(iommu);
4640
4641 if (dmaru->ignored) {
4642 /*
4643 * we always have to disable PMRs or DMA may fail on this device
4644 */
4645 if (force_on)
4646 iommu_disable_protect_mem_regions(iommu);
4647 return 0;
4648 }
4649
4650 intel_iommu_init_qi(iommu);
4651 iommu_flush_write_buffer(iommu);
4652
4653 #ifdef CONFIG_INTEL_IOMMU_SVM
4654 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4655 ret = intel_svm_enable_prq(iommu);
4656 if (ret)
4657 goto disable_iommu;
4658 }
4659 #endif
4660 ret = dmar_set_interrupt(iommu);
4661 if (ret)
4662 goto disable_iommu;
4663
4664 iommu_set_root_entry(iommu);
4665 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4666 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4667 iommu_enable_translation(iommu);
4668
4669 iommu_disable_protect_mem_regions(iommu);
4670 return 0;
4671
4672 disable_iommu:
4673 disable_dmar_iommu(iommu);
4674 out:
4675 free_dmar_iommu(iommu);
4676 return ret;
4677 }
4678
4679 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4680 {
4681 int ret = 0;
4682 struct intel_iommu *iommu = dmaru->iommu;
4683
4684 if (!intel_iommu_enabled)
4685 return 0;
4686 if (iommu == NULL)
4687 return -EINVAL;
4688
4689 if (insert) {
4690 ret = intel_iommu_add(dmaru);
4691 } else {
4692 disable_dmar_iommu(iommu);
4693 free_dmar_iommu(iommu);
4694 }
4695
4696 return ret;
4697 }
4698
4699 static void intel_iommu_free_dmars(void)
4700 {
4701 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4702 struct dmar_atsr_unit *atsru, *atsr_n;
4703
4704 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4705 list_del(&rmrru->list);
4706 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4707 kfree(rmrru);
4708 }
4709
4710 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4711 list_del(&atsru->list);
4712 intel_iommu_free_atsr(atsru);
4713 }
4714 }
4715
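/*
 * Decide whether ATS may be used for @dev: walk up to the PCIe root port.
 * Root-bus (integrated) devices always allow ATS, paths through non-PCIe
 * bridges never do, and anything else requires the root port to be listed
 * in an ATSR for the device's segment (or an include_all ATSR).
 */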
4716 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4717 {
4718 int i, ret = 1;
4719 struct pci_bus *bus;
4720 struct pci_dev *bridge = NULL;
4721 struct device *tmp;
4722 struct acpi_dmar_atsr *atsr;
4723 struct dmar_atsr_unit *atsru;
4724
4725 dev = pci_physfn(dev);
4726 for (bus = dev->bus; bus; bus = bus->parent) {
4727 bridge = bus->self;
4728 /* If it's an integrated device, allow ATS */
4729 if (!bridge)
4730 return 1;
4731 /* Connected via non-PCIe: no ATS */
4732 if (!pci_is_pcie(bridge) ||
4733 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4734 return 0;
4735 /* If we found the root port, look it up in the ATSR */
4736 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4737 break;
4738 }
4739
4740 rcu_read_lock();
4741 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4742 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4743 if (atsr->segment != pci_domain_nr(dev->bus))
4744 continue;
4745
4746 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4747 if (tmp == &bridge->dev)
4748 goto out;
4749
4750 if (atsru->include_all)
4751 goto out;
4752 }
4753 ret = 0;
4754 out:
4755 rcu_read_unlock();
4756
4757 return ret;
4758 }
4759
4760 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4761 {
4762 int ret;
4763 struct dmar_rmrr_unit *rmrru;
4764 struct dmar_atsr_unit *atsru;
4765 struct acpi_dmar_atsr *atsr;
4766 struct acpi_dmar_reserved_memory *rmrr;
4767
4768 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4769 return 0;
4770
4771 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4772 rmrr = container_of(rmrru->hdr,
4773 struct acpi_dmar_reserved_memory, header);
4774 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4775 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4776 ((void *)rmrr) + rmrr->header.length,
4777 rmrr->segment, rmrru->devices,
4778 rmrru->devices_cnt);
4779 if (ret < 0)
4780 return ret;
4781 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4782 dmar_remove_dev_scope(info, rmrr->segment,
4783 rmrru->devices, rmrru->devices_cnt);
4784 }
4785 }
4786
4787 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4788 if (atsru->include_all)
4789 continue;
4790
4791 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4792 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4793 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4794 (void *)atsr + atsr->header.length,
4795 atsr->segment, atsru->devices,
4796 atsru->devices_cnt);
4797 if (ret > 0)
4798 break;
4799 else if (ret < 0)
4800 return ret;
4801 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4802 if (dmar_remove_dev_scope(info, atsr->segment,
4803 atsru->devices, atsru->devices_cnt))
4804 break;
4805 }
4806 }
4807
4808 return 0;
4809 }
4810
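/*
 * Keep the static identity (si_domain) map in sync with memory hotplug:
 * ranges going online are identity mapped, and offlined ranges have their
 * IOVAs, page tables and IOTLB entries torn down.
 */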
4811 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4812 unsigned long val, void *v)
4813 {
4814 struct memory_notify *mhp = v;
4815 unsigned long long start, end;
4816 unsigned long start_vpfn, last_vpfn;
4817
4818 switch (val) {
4819 case MEM_GOING_ONLINE:
4820 start = mhp->start_pfn << PAGE_SHIFT;
4821 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4822 if (iommu_domain_identity_map(si_domain, start, end)) {
4823 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4824 start, end);
4825 return NOTIFY_BAD;
4826 }
4827 break;
4828
4829 case MEM_OFFLINE:
4830 case MEM_CANCEL_ONLINE:
4831 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4832 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4833 while (start_vpfn <= last_vpfn) {
4834 struct iova *iova;
4835 struct dmar_drhd_unit *drhd;
4836 struct intel_iommu *iommu;
4837 struct page *freelist;
4838
4839 iova = find_iova(&si_domain->iovad, start_vpfn);
4840 if (iova == NULL) {
4841 pr_debug("Failed get IOVA for PFN %lx\n",
4842 start_vpfn);
4843 break;
4844 }
4845
4846 iova = split_and_remove_iova(&si_domain->iovad, iova,
4847 start_vpfn, last_vpfn);
4848 if (iova == NULL) {
4849 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4850 start_vpfn, last_vpfn);
4851 return NOTIFY_BAD;
4852 }
4853
4854 freelist = domain_unmap(si_domain, iova->pfn_lo,
4855 iova->pfn_hi);
4856
4857 rcu_read_lock();
4858 for_each_active_iommu(iommu, drhd)
4859 iommu_flush_iotlb_psi(iommu, si_domain,
4860 iova->pfn_lo, iova_size(iova),
4861 !freelist, 0);
4862 rcu_read_unlock();
4863 dma_free_pagelist(freelist);
4864
4865 start_vpfn = iova->pfn_hi + 1;
4866 free_iova_mem(iova);
4867 }
4868 break;
4869 }
4870
4871 return NOTIFY_OK;
4872 }
4873
4874 static struct notifier_block intel_iommu_memory_nb = {
4875 .notifier_call = intel_iommu_memory_notifier,
4876 .priority = 0
4877 };
4878
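/*
 * CPU hotplug teardown: when a CPU dies, return its per-CPU cached IOVAs
 * for every domain to the global IOVA pool. Registered via
 * cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, ...) in intel_iommu_init().
 */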
4879 static void free_all_cpu_cached_iovas(unsigned int cpu)
4880 {
4881 int i;
4882
4883 for (i = 0; i < g_num_of_iommus; i++) {
4884 struct intel_iommu *iommu = g_iommus[i];
4885 struct dmar_domain *domain;
4886 int did;
4887
4888 if (!iommu)
4889 continue;
4890
4891 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4892 domain = get_iommu_domain(iommu, (u16)did);
4893
4894 if (!domain)
4895 continue;
4896 free_cpu_cached_iovas(cpu, &domain->iovad);
4897 }
4898 }
4899 }
4900
4901 static int intel_iommu_cpu_dead(unsigned int cpu)
4902 {
4903 free_all_cpu_cached_iovas(cpu);
4904 return 0;
4905 }
4906
4907 static void intel_disable_iommus(void)
4908 {
4909 struct intel_iommu *iommu = NULL;
4910 struct dmar_drhd_unit *drhd;
4911
4912 for_each_iommu(iommu, drhd)
4913 iommu_disable_translation(iommu);
4914 }
4915
4916 void intel_iommu_shutdown(void)
4917 {
4918 struct dmar_drhd_unit *drhd;
4919 struct intel_iommu *iommu = NULL;
4920
4921 if (no_iommu || dmar_disabled)
4922 return;
4923
4924 down_write(&dmar_global_lock);
4925
4926 /* Disable PMRs explicitly here. */
4927 for_each_iommu(iommu, drhd)
4928 iommu_disable_protect_mem_regions(iommu);
4929
4930 /* Make sure the IOMMUs are switched off */
4931 intel_disable_iommus();
4932
4933 up_write(&dmar_global_lock);
4934 }
4935
4936 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4937 {
4938 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4939
4940 return container_of(iommu_dev, struct intel_iommu, iommu);
4941 }
4942
4943 static ssize_t intel_iommu_show_version(struct device *dev,
4944 struct device_attribute *attr,
4945 char *buf)
4946 {
4947 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4948 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4949 return sprintf(buf, "%d:%d\n",
4950 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4951 }
4952 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4953
4954 static ssize_t intel_iommu_show_address(struct device *dev,
4955 struct device_attribute *attr,
4956 char *buf)
4957 {
4958 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4959 return sprintf(buf, "%llx\n", iommu->reg_phys);
4960 }
4961 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4962
4963 static ssize_t intel_iommu_show_cap(struct device *dev,
4964 struct device_attribute *attr,
4965 char *buf)
4966 {
4967 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4968 return sprintf(buf, "%llx\n", iommu->cap);
4969 }
4970 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4971
4972 static ssize_t intel_iommu_show_ecap(struct device *dev,
4973 struct device_attribute *attr,
4974 char *buf)
4975 {
4976 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4977 return sprintf(buf, "%llx\n", iommu->ecap);
4978 }
4979 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4980
4981 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4982 struct device_attribute *attr,
4983 char *buf)
4984 {
4985 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4986 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4987 }
4988 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4989
4990 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4991 struct device_attribute *attr,
4992 char *buf)
4993 {
4994 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4995 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4996 cap_ndoms(iommu->cap)));
4997 }
4998 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4999
5000 static struct attribute *intel_iommu_attrs[] = {
5001 &dev_attr_version.attr,
5002 &dev_attr_address.attr,
5003 &dev_attr_cap.attr,
5004 &dev_attr_ecap.attr,
5005 &dev_attr_domains_supported.attr,
5006 &dev_attr_domains_used.attr,
5007 NULL,
5008 };
5009
5010 static struct attribute_group intel_iommu_group = {
5011 .name = "intel-iommu",
5012 .attrs = intel_iommu_attrs,
5013 };
5014
5015 const struct attribute_group *intel_iommu_groups[] = {
5016 &intel_iommu_group,
5017 NULL,
5018 };
5019
5020 static inline bool has_untrusted_dev(void)
5021 {
5022 struct pci_dev *pdev = NULL;
5023
5024 for_each_pci_dev(pdev)
5025 if (pdev->untrusted)
5026 return true;
5027
5028 return false;
5029 }
5030
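/*
 * Honour the DMAR platform opt-in flag: if the firmware requests DMA
 * protection and an untrusted device (typically one behind an external
 * facing port) is present, force-enable the IOMMU even if it was disabled
 * on the command line. If remapping had been disabled, the default domain
 * type is switched to passthrough so that only untrusted devices end up
 * translated.
 */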
5031 static int __init platform_optin_force_iommu(void)
5032 {
5033 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
5034 return 0;
5035
5036 if (no_iommu || dmar_disabled)
5037 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
5038
5039 /*
5040 * If Intel-IOMMU is disabled by default, we will apply identity
5041 * map for all devices except those marked as being untrusted.
5042 */
5043 if (dmar_disabled)
5044 iommu_set_default_passthrough(false);
5045
5046 dmar_disabled = 0;
5047 no_iommu = 0;
5048
5049 return 1;
5050 }
5051
5052 static int __init probe_acpi_namespace_devices(void)
5053 {
5054 struct dmar_drhd_unit *drhd;
5055 /* To avoid a -Wunused-but-set-variable warning. */
5056 struct intel_iommu *iommu __maybe_unused;
5057 struct device *dev;
5058 int i, ret = 0;
5059
5060 for_each_active_iommu(iommu, drhd) {
5061 for_each_active_dev_scope(drhd->devices,
5062 drhd->devices_cnt, i, dev) {
5063 struct acpi_device_physical_node *pn;
5064 struct iommu_group *group;
5065 struct acpi_device *adev;
5066
5067 if (dev->bus != &acpi_bus_type)
5068 continue;
5069
5070 adev = to_acpi_device(dev);
5071 mutex_lock(&adev->physical_node_lock);
5072 list_for_each_entry(pn,
5073 &adev->physical_node_list, node) {
5074 group = iommu_group_get(pn->dev);
5075 if (group) {
5076 iommu_group_put(group);
5077 continue;
5078 }
5079
5080 pn->dev->bus->iommu_ops = &intel_iommu_ops;
5081 ret = iommu_probe_device(pn->dev);
5082 if (ret)
5083 break;
5084 }
5085 mutex_unlock(&adev->physical_node_lock);
5086
5087 if (ret)
5088 return ret;
5089 }
5090 }
5091
5092 return 0;
5093 }
5094
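/*
 * Main initialization entry point: parse the DMAR table and device scopes,
 * bail out early (after switching off any translation a previous kernel or
 * firmware left enabled) when remapping is disabled, otherwise initialize
 * all DMAR units, install intel_dma_ops, register sysfs entries and the
 * memory/CPU hotplug notifiers, and finally enable translation on every
 * unit that was not already pre-enabled.
 */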
5095 int __init intel_iommu_init(void)
5096 {
5097 int ret = -ENODEV;
5098 struct dmar_drhd_unit *drhd;
5099 struct intel_iommu *iommu;
5100
5101 /*
5102 * Intel IOMMU is required for a TXT/tboot launch or platform
5103 * opt in, so enforce that.
5104 */
5105 force_on = tboot_force_iommu() || platform_optin_force_iommu();
5106
5107 if (iommu_init_mempool()) {
5108 if (force_on)
5109 panic("tboot: Failed to initialize iommu memory\n");
5110 return -ENOMEM;
5111 }
5112
5113 down_write(&dmar_global_lock);
5114 if (dmar_table_init()) {
5115 if (force_on)
5116 panic("tboot: Failed to initialize DMAR table\n");
5117 goto out_free_dmar;
5118 }
5119
5120 if (dmar_dev_scope_init() < 0) {
5121 if (force_on)
5122 panic("tboot: Failed to initialize DMAR device scope\n");
5123 goto out_free_dmar;
5124 }
5125
5126 up_write(&dmar_global_lock);
5127
5128 /*
5129 * The bus notifier takes the dmar_global_lock, so lockdep will
5130 * complain later when we register it under the lock.
5131 */
5132 dmar_register_bus_notifier();
5133
5134 down_write(&dmar_global_lock);
5135
5136 if (!no_iommu)
5137 intel_iommu_debugfs_init();
5138
5139 if (no_iommu || dmar_disabled) {
5140 /*
5141 * We exit the function here to ensure IOMMU's remapping and
5142 * mempool aren't set up, which means that the IOMMU's PMRs
5143 * won't be disabled via the call to init_dmars(). So disable
5144 * it explicitly here. The PMRs were set up by tboot prior to
5145 * calling SENTER, but the kernel is expected to reset/tear
5146 * down the PMRs.
5147 */
5148 if (intel_iommu_tboot_noforce) {
5149 for_each_iommu(iommu, drhd)
5150 iommu_disable_protect_mem_regions(iommu);
5151 }
5152
5153 /*
5154 * Make sure the IOMMUs are switched off, even when we
5155 * boot into a kexec kernel and the previous kernel left
5156 * them enabled
5157 */
5158 intel_disable_iommus();
5159 goto out_free_dmar;
5160 }
5161
5162 if (list_empty(&dmar_rmrr_units))
5163 pr_info("No RMRR found\n");
5164
5165 if (list_empty(&dmar_atsr_units))
5166 pr_info("No ATSR found\n");
5167
5168 if (dmar_init_reserved_ranges()) {
5169 if (force_on)
5170 panic("tboot: Failed to reserve iommu ranges\n");
5171 goto out_free_reserved_range;
5172 }
5173
5174 if (dmar_map_gfx)
5175 intel_iommu_gfx_mapped = 1;
5176
5177 init_no_remapping_devices();
5178
5179 ret = init_dmars();
5180 if (ret) {
5181 if (force_on)
5182 panic("tboot: Failed to initialize DMARs\n");
5183 pr_err("Initialization failed\n");
5184 goto out_free_reserved_range;
5185 }
5186 up_write(&dmar_global_lock);
5187
5188 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5189 /*
5190 * If the system has no untrusted device or the user has decided
5191 * to disable the bounce page mechanisms, we don't need swiotlb.
5192 * Mark this and the pre-allocated bounce pages will be released
5193 * later.
5194 */
5195 if (!has_untrusted_dev() || intel_no_bounce)
5196 swiotlb = 0;
5197 #endif
5198 dma_ops = &intel_dma_ops;
5199
5200 init_iommu_pm_ops();
5201
5202 down_read(&dmar_global_lock);
5203 for_each_active_iommu(iommu, drhd) {
5204 iommu_device_sysfs_add(&iommu->iommu, NULL,
5205 intel_iommu_groups,
5206 "%s", iommu->name);
5207 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5208 iommu_device_register(&iommu->iommu);
5209 }
5210 up_read(&dmar_global_lock);
5211
5212 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5213 if (si_domain && !hw_pass_through)
5214 register_memory_notifier(&intel_iommu_memory_nb);
5215 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5216 intel_iommu_cpu_dead);
5217
5218 down_read(&dmar_global_lock);
5219 if (probe_acpi_namespace_devices())
5220 pr_warn("ACPI name space devices didn't probe correctly\n");
5221
5222 /* Finally, we enable the DMA remapping hardware. */
5223 for_each_iommu(iommu, drhd) {
5224 if (!drhd->ignored && !translation_pre_enabled(iommu))
5225 iommu_enable_translation(iommu);
5226
5227 iommu_disable_protect_mem_regions(iommu);
5228 }
5229 up_read(&dmar_global_lock);
5230
5231 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5232
5233 intel_iommu_enabled = 1;
5234
5235 return 0;
5236
5237 out_free_reserved_range:
5238 put_iova_domain(&reserved_iova_list);
5239 out_free_dmar:
5240 intel_iommu_free_dmars();
5241 up_write(&dmar_global_lock);
5242 iommu_exit_mempool();
5243 return ret;
5244 }
5245
5246 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5247 {
5248 struct intel_iommu *iommu = opaque;
5249
5250 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5251 return 0;
5252 }
5253
5254 /*
5255 * NB - intel-iommu lacks any sort of reference counting for the users of
5256 * dependent devices. If multiple endpoints have intersecting dependent
5257 * devices, unbinding the driver from any one of them will possibly leave
5258 * the others unable to operate.
5259 */
5260 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5261 {
5262 if (!iommu || !dev || !dev_is_pci(dev))
5263 return;
5264
5265 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5266 }
5267
5268 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5269 {
5270 struct dmar_domain *domain;
5271 struct intel_iommu *iommu;
5272 unsigned long flags;
5273
5274 assert_spin_locked(&device_domain_lock);
5275
5276 if (WARN_ON(!info))
5277 return;
5278
5279 iommu = info->iommu;
5280 domain = info->domain;
5281
5282 if (info->dev) {
5283 if (dev_is_pci(info->dev) && sm_supported(iommu))
5284 intel_pasid_tear_down_entry(iommu, info->dev,
5285 PASID_RID2PASID);
5286
5287 iommu_disable_dev_iotlb(info);
5288 domain_context_clear(iommu, info->dev);
5289 intel_pasid_free_table(info->dev);
5290 }
5291
5292 unlink_domain_info(info);
5293
5294 spin_lock_irqsave(&iommu->lock, flags);
5295 domain_detach_iommu(domain, iommu);
5296 spin_unlock_irqrestore(&iommu->lock, flags);
5297
5298 /* free the private domain */
5299 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5300 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5301 list_empty(&domain->devices))
5302 domain_exit(info->domain);
5303
5304 free_devinfo_mem(info);
5305 }
5306
5307 static void dmar_remove_one_dev_info(struct device *dev)
5308 {
5309 struct device_domain_info *info;
5310 unsigned long flags;
5311
5312 spin_lock_irqsave(&device_domain_lock, flags);
5313 info = dev->archdata.iommu;
5314 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5315 && info != DUMMY_DEVICE_DOMAIN_INFO)
5316 __dmar_remove_one_dev_info(info);
5317 spin_unlock_irqrestore(&device_domain_lock, flags);
5318 }
5319
5320 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5321 {
5322 int adjust_width;
5323
5324 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5325 domain_reserve_special_ranges(domain);
5326
5327 /* calculate AGAW */
5328 domain->gaw = guest_width;
5329 adjust_width = guestwidth_to_adjustwidth(guest_width);
5330 domain->agaw = width_to_agaw(adjust_width);
5331
5332 domain->iommu_coherency = 0;
5333 domain->iommu_snooping = 0;
5334 domain->iommu_superpage = 0;
5335 domain->max_addr = 0;
5336
5337 /* always allocate the top pgd */
5338 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5339 if (!domain->pgd)
5340 return -ENOMEM;
5341 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5342 return 0;
5343 }
5344
5345 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5346 {
5347 struct dmar_domain *dmar_domain;
5348 struct iommu_domain *domain;
5349 int ret;
5350
5351 switch (type) {
5352 case IOMMU_DOMAIN_DMA:
5353 /* fallthrough */
5354 case IOMMU_DOMAIN_UNMANAGED:
5355 dmar_domain = alloc_domain(0);
5356 if (!dmar_domain) {
5357 pr_err("Can't allocate dmar_domain\n");
5358 return NULL;
5359 }
5360 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5361 pr_err("Domain initialization failed\n");
5362 domain_exit(dmar_domain);
5363 return NULL;
5364 }
5365
5366 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5367 ret = init_iova_flush_queue(&dmar_domain->iovad,
5368 iommu_flush_iova,
5369 iova_entry_free);
5370 if (ret)
5371 pr_info("iova flush queue initialization failed\n");
5372 }
5373
5374 domain_update_iommu_cap(dmar_domain);
5375
5376 domain = &dmar_domain->domain;
5377 domain->geometry.aperture_start = 0;
5378 domain->geometry.aperture_end =
5379 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5380 domain->geometry.force_aperture = true;
5381
5382 return domain;
5383 case IOMMU_DOMAIN_IDENTITY:
5384 return &si_domain->domain;
5385 default:
5386 return NULL;
5387 }
5388
5389 return NULL;
5390 }
5391
5392 static void intel_iommu_domain_free(struct iommu_domain *domain)
5393 {
5394 if (domain != &si_domain->domain)
5395 domain_exit(to_dmar_domain(domain));
5396 }
5397
5398 /*
5399 * Check whether a @domain could be attached to the @dev through the
5400 * aux-domain attach/detach APIs.
5401 */
5402 static inline bool
5403 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5404 {
5405 struct device_domain_info *info = dev->archdata.iommu;
5406
5407 return info && info->auxd_enabled &&
5408 domain->type == IOMMU_DOMAIN_UNMANAGED;
5409 }
5410
5411 static void auxiliary_link_device(struct dmar_domain *domain,
5412 struct device *dev)
5413 {
5414 struct device_domain_info *info = dev->archdata.iommu;
5415
5416 assert_spin_locked(&device_domain_lock);
5417 if (WARN_ON(!info))
5418 return;
5419
5420 domain->auxd_refcnt++;
5421 list_add(&domain->auxd, &info->auxiliary_domains);
5422 }
5423
5424 static void auxiliary_unlink_device(struct dmar_domain *domain,
5425 struct device *dev)
5426 {
5427 struct device_domain_info *info = dev->archdata.iommu;
5428
5429 assert_spin_locked(&device_domain_lock);
5430 if (WARN_ON(!info))
5431 return;
5432
5433 list_del(&domain->auxd);
5434 domain->auxd_refcnt--;
5435
5436 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5437 ioasid_free(domain->default_pasid);
5438 }
5439
5440 static int aux_domain_add_dev(struct dmar_domain *domain,
5441 struct device *dev)
5442 {
5443 int ret;
5444 u8 bus, devfn;
5445 unsigned long flags;
5446 struct intel_iommu *iommu;
5447
5448 iommu = device_to_iommu(dev, &bus, &devfn);
5449 if (!iommu)
5450 return -ENODEV;
5451
5452 if (domain->default_pasid <= 0) {
5453 int pasid;
5454
5455 /* No private data needed for the default pasid */
5456 pasid = ioasid_alloc(NULL, PASID_MIN,
5457 pci_max_pasids(to_pci_dev(dev)) - 1,
5458 NULL);
5459 if (pasid == INVALID_IOASID) {
5460 pr_err("Can't allocate default pasid\n");
5461 return -ENODEV;
5462 }
5463 domain->default_pasid = pasid;
5464 }
5465
5466 spin_lock_irqsave(&device_domain_lock, flags);
5467 /*
5468 * iommu->lock must be held to attach domain to iommu and setup the
5469 * pasid entry for second level translation.
5470 */
5471 spin_lock(&iommu->lock);
5472 ret = domain_attach_iommu(domain, iommu);
5473 if (ret)
5474 goto attach_failed;
5475
5476 /* Setup the PASID entry for mediated devices: */
5477 if (domain_use_first_level(domain))
5478 ret = domain_setup_first_level(iommu, domain, dev,
5479 domain->default_pasid);
5480 else
5481 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5482 domain->default_pasid);
5483 if (ret)
5484 goto table_failed;
5485 spin_unlock(&iommu->lock);
5486
5487 auxiliary_link_device(domain, dev);
5488
5489 spin_unlock_irqrestore(&device_domain_lock, flags);
5490
5491 return 0;
5492
5493 table_failed:
5494 domain_detach_iommu(domain, iommu);
5495 attach_failed:
5496 spin_unlock(&iommu->lock);
5497 spin_unlock_irqrestore(&device_domain_lock, flags);
5498 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5499 ioasid_free(domain->default_pasid);
5500
5501 return ret;
5502 }
5503
5504 static void aux_domain_remove_dev(struct dmar_domain *domain,
5505 struct device *dev)
5506 {
5507 struct device_domain_info *info;
5508 struct intel_iommu *iommu;
5509 unsigned long flags;
5510
5511 if (!is_aux_domain(dev, &domain->domain))
5512 return;
5513
5514 spin_lock_irqsave(&device_domain_lock, flags);
5515 info = dev->archdata.iommu;
5516 iommu = info->iommu;
5517
5518 auxiliary_unlink_device(domain, dev);
5519
5520 spin_lock(&iommu->lock);
5521 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5522 domain_detach_iommu(domain, iommu);
5523 spin_unlock(&iommu->lock);
5524
5525 spin_unlock_irqrestore(&device_domain_lock, flags);
5526 }
5527
5528 static int prepare_domain_attach_device(struct iommu_domain *domain,
5529 struct device *dev)
5530 {
5531 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5532 struct intel_iommu *iommu;
5533 int addr_width;
5534 u8 bus, devfn;
5535
5536 iommu = device_to_iommu(dev, &bus, &devfn);
5537 if (!iommu)
5538 return -ENODEV;
5539
5540 /* check if this iommu agaw is sufficient for max mapped address */
5541 addr_width = agaw_to_width(iommu->agaw);
5542 if (addr_width > cap_mgaw(iommu->cap))
5543 addr_width = cap_mgaw(iommu->cap);
5544
5545 if (dmar_domain->max_addr > (1LL << addr_width)) {
5546 dev_err(dev, "%s: iommu width (%d) is not "
5547 "sufficient for the mapped address (%llx)\n",
5548 __func__, addr_width, dmar_domain->max_addr);
5549 return -EFAULT;
5550 }
5551 dmar_domain->gaw = addr_width;
5552
5553 /*
5554 * Knock out extra levels of page tables if necessary
5555 */
5556 while (iommu->agaw < dmar_domain->agaw) {
5557 struct dma_pte *pte;
5558
5559 pte = dmar_domain->pgd;
5560 if (dma_pte_present(pte)) {
5561 dmar_domain->pgd = (struct dma_pte *)
5562 phys_to_virt(dma_pte_addr(pte));
5563 free_pgtable_page(pte);
5564 }
5565 dmar_domain->agaw--;
5566 }
5567
5568 return 0;
5569 }
5570
5571 static int intel_iommu_attach_device(struct iommu_domain *domain,
5572 struct device *dev)
5573 {
5574 int ret;
5575
5576 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5577 device_is_rmrr_locked(dev)) {
5578 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5579 return -EPERM;
5580 }
5581
5582 if (is_aux_domain(dev, domain))
5583 return -EPERM;
5584
5585 /* normally dev is not mapped */
5586 if (unlikely(domain_context_mapped(dev))) {
5587 struct dmar_domain *old_domain;
5588
5589 old_domain = find_domain(dev);
5590 if (old_domain)
5591 dmar_remove_one_dev_info(dev);
5592 }
5593
5594 ret = prepare_domain_attach_device(domain, dev);
5595 if (ret)
5596 return ret;
5597
5598 return domain_add_dev_info(to_dmar_domain(domain), dev);
5599 }
5600
5601 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5602 struct device *dev)
5603 {
5604 int ret;
5605
5606 if (!is_aux_domain(dev, domain))
5607 return -EPERM;
5608
5609 ret = prepare_domain_attach_device(domain, dev);
5610 if (ret)
5611 return ret;
5612
5613 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5614 }
5615
5616 static void intel_iommu_detach_device(struct iommu_domain *domain,
5617 struct device *dev)
5618 {
5619 dmar_remove_one_dev_info(dev);
5620 }
5621
5622 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5623 struct device *dev)
5624 {
5625 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5626 }
5627
5628 static int intel_iommu_map(struct iommu_domain *domain,
5629 unsigned long iova, phys_addr_t hpa,
5630 size_t size, int iommu_prot, gfp_t gfp)
5631 {
5632 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5633 u64 max_addr;
5634 int prot = 0;
5635 int ret;
5636
5637 if (iommu_prot & IOMMU_READ)
5638 prot |= DMA_PTE_READ;
5639 if (iommu_prot & IOMMU_WRITE)
5640 prot |= DMA_PTE_WRITE;
5641 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5642 prot |= DMA_PTE_SNP;
5643
5644 max_addr = iova + size;
5645 if (dmar_domain->max_addr < max_addr) {
5646 u64 end;
5647
5648 /* check if minimum agaw is sufficient for mapped address */
5649 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5650 if (end < max_addr) {
5651 pr_err("%s: iommu width (%d) is not "
5652 "sufficient for the mapped address (%llx)\n",
5653 __func__, dmar_domain->gaw, max_addr);
5654 return -EFAULT;
5655 }
5656 dmar_domain->max_addr = max_addr;
5657 }
5658 /* Round up size to next multiple of PAGE_SIZE, if it and
5659 the low bits of hpa would take us onto the next page */
5660 size = aligned_nrpages(hpa, size);
5661 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5662 hpa >> VTD_PAGE_SHIFT, size, prot);
5663 return ret;
5664 }
5665
5666 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5667 unsigned long iova, size_t size,
5668 struct iommu_iotlb_gather *gather)
5669 {
5670 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5671 struct page *freelist = NULL;
5672 unsigned long start_pfn, last_pfn;
5673 unsigned int npages;
5674 int iommu_id, level = 0;
5675
5676 /* Cope with horrid API which requires us to unmap more than the
5677 size argument if it happens to be a large-page mapping. */
5678 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5679
5680 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5681 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5682
5683 start_pfn = iova >> VTD_PAGE_SHIFT;
5684 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5685
5686 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5687
5688 npages = last_pfn - start_pfn + 1;
5689
5690 for_each_domain_iommu(iommu_id, dmar_domain)
5691 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5692 start_pfn, npages, !freelist, 0);
5693
5694 dma_free_pagelist(freelist);
5695
5696 if (dmar_domain->max_addr == iova + size)
5697 dmar_domain->max_addr = iova;
5698
5699 return size;
5700 }
5701
5702 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5703 dma_addr_t iova)
5704 {
5705 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5706 struct dma_pte *pte;
5707 int level = 0;
5708 u64 phys = 0;
5709
5710 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5711 if (pte && dma_pte_present(pte))
5712 phys = dma_pte_addr(pte) +
5713 (iova & (BIT_MASK(level_to_offset_bits(level) +
5714 VTD_PAGE_SHIFT) - 1));
5715
5716 return phys;
5717 }
5718
5719 static inline bool scalable_mode_support(void)
5720 {
5721 struct dmar_drhd_unit *drhd;
5722 struct intel_iommu *iommu;
5723 bool ret = true;
5724
5725 rcu_read_lock();
5726 for_each_active_iommu(iommu, drhd) {
5727 if (!sm_supported(iommu)) {
5728 ret = false;
5729 break;
5730 }
5731 }
5732 rcu_read_unlock();
5733
5734 return ret;
5735 }
5736
5737 static inline bool iommu_pasid_support(void)
5738 {
5739 struct dmar_drhd_unit *drhd;
5740 struct intel_iommu *iommu;
5741 bool ret = true;
5742
5743 rcu_read_lock();
5744 for_each_active_iommu(iommu, drhd) {
5745 if (!pasid_supported(iommu)) {
5746 ret = false;
5747 break;
5748 }
5749 }
5750 rcu_read_unlock();
5751
5752 return ret;
5753 }
5754
5755 static inline bool nested_mode_support(void)
5756 {
5757 struct dmar_drhd_unit *drhd;
5758 struct intel_iommu *iommu;
5759 bool ret = true;
5760
5761 rcu_read_lock();
5762 for_each_active_iommu(iommu, drhd) {
5763 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5764 ret = false;
5765 break;
5766 }
5767 }
5768 rcu_read_unlock();
5769
5770 return ret;
5771 }
5772
5773 static bool intel_iommu_capable(enum iommu_cap cap)
5774 {
5775 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5776 return domain_update_iommu_snooping(NULL) == 1;
5777 if (cap == IOMMU_CAP_INTR_REMAP)
5778 return irq_remapping_enabled == 1;
5779
5780 return false;
5781 }
5782
5783 static int intel_iommu_add_device(struct device *dev)
5784 {
5785 struct dmar_domain *dmar_domain;
5786 struct iommu_domain *domain;
5787 struct intel_iommu *iommu;
5788 struct iommu_group *group;
5789 u8 bus, devfn;
5790 int ret;
5791
5792 iommu = device_to_iommu(dev, &bus, &devfn);
5793 if (!iommu)
5794 return -ENODEV;
5795
5796 iommu_device_link(&iommu->iommu, dev);
5797
5798 if (translation_pre_enabled(iommu))
5799 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5800
5801 group = iommu_group_get_for_dev(dev);
5802
5803 if (IS_ERR(group)) {
5804 ret = PTR_ERR(group);
5805 goto unlink;
5806 }
5807
5808 iommu_group_put(group);
5809
5810 domain = iommu_get_domain_for_dev(dev);
5811 dmar_domain = to_dmar_domain(domain);
5812 if (domain->type == IOMMU_DOMAIN_DMA) {
5813 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5814 ret = iommu_request_dm_for_dev(dev);
5815 if (ret) {
5816 dmar_remove_one_dev_info(dev);
5817 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5818 domain_add_dev_info(si_domain, dev);
5819 dev_info(dev,
5820 "Device uses a private identity domain.\n");
5821 }
5822 }
5823 } else {
5824 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5825 ret = iommu_request_dma_domain_for_dev(dev);
5826 if (ret) {
5827 dmar_remove_one_dev_info(dev);
5828 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5829 if (!get_private_domain_for_dev(dev)) {
5830 dev_warn(dev,
5831 "Failed to get a private domain.\n");
5832 ret = -ENOMEM;
5833 goto unlink;
5834 }
5835
5836 dev_info(dev,
5837 "Device uses a private dma domain.\n");
5838 }
5839 }
5840 }
5841
5842 if (device_needs_bounce(dev)) {
5843 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5844 set_dma_ops(dev, &bounce_dma_ops);
5845 }
5846
5847 return 0;
5848
5849 unlink:
5850 iommu_device_unlink(&iommu->iommu, dev);
5851 return ret;
5852 }
5853
5854 static void intel_iommu_remove_device(struct device *dev)
5855 {
5856 struct intel_iommu *iommu;
5857 u8 bus, devfn;
5858
5859 iommu = device_to_iommu(dev, &bus, &devfn);
5860 if (!iommu)
5861 return;
5862
5863 dmar_remove_one_dev_info(dev);
5864
5865 iommu_group_remove_device(dev);
5866
5867 iommu_device_unlink(&iommu->iommu, dev);
5868
5869 if (device_needs_bounce(dev))
5870 set_dma_ops(dev, NULL);
5871 }
5872
5873 static void intel_iommu_get_resv_regions(struct device *device,
5874 struct list_head *head)
5875 {
5876 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5877 struct iommu_resv_region *reg;
5878 struct dmar_rmrr_unit *rmrr;
5879 struct device *i_dev;
5880 int i;
5881
5882 down_read(&dmar_global_lock);
5883 for_each_rmrr_units(rmrr) {
5884 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5885 i, i_dev) {
5886 struct iommu_resv_region *resv;
5887 enum iommu_resv_type type;
5888 size_t length;
5889
5890 if (i_dev != device &&
5891 !is_downstream_to_pci_bridge(device, i_dev))
5892 continue;
5893
5894 length = rmrr->end_address - rmrr->base_address + 1;
5895
5896 type = device_rmrr_is_relaxable(device) ?
5897 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5898
5899 resv = iommu_alloc_resv_region(rmrr->base_address,
5900 length, prot, type);
5901 if (!resv)
5902 break;
5903
5904 list_add_tail(&resv->list, head);
5905 }
5906 }
5907 up_read(&dmar_global_lock);
5908
5909 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5910 if (dev_is_pci(device)) {
5911 struct pci_dev *pdev = to_pci_dev(device);
5912
5913 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5914 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5915 IOMMU_RESV_DIRECT_RELAXABLE);
5916 if (reg)
5917 list_add_tail(&reg->list, head);
5918 }
5919 }
5920 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5921
5922 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5923 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5924 0, IOMMU_RESV_MSI);
5925 if (!reg)
5926 return;
5927 list_add_tail(&reg->list, head);
5928 }
5929
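/*
 * Enable PASID support for @dev: set the PASID-enable bit in the device's
 * context entry (flushing the context cache when the bit changes) and
 * turn on the endpoint's PASID/device-IOTLB features if needed.
 */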
5930 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5931 {
5932 struct device_domain_info *info;
5933 struct context_entry *context;
5934 struct dmar_domain *domain;
5935 unsigned long flags;
5936 u64 ctx_lo;
5937 int ret;
5938
5939 domain = find_domain(dev);
5940 if (!domain)
5941 return -EINVAL;
5942
5943 spin_lock_irqsave(&device_domain_lock, flags);
5944 spin_lock(&iommu->lock);
5945
5946 ret = -EINVAL;
5947 info = dev->archdata.iommu;
5948 if (!info || !info->pasid_supported)
5949 goto out;
5950
5951 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5952 if (WARN_ON(!context))
5953 goto out;
5954
5955 ctx_lo = context[0].lo;
5956
5957 if (!(ctx_lo & CONTEXT_PASIDE)) {
5958 ctx_lo |= CONTEXT_PASIDE;
5959 context[0].lo = ctx_lo;
5960 wmb();
5961 iommu->flush.flush_context(iommu,
5962 domain->iommu_did[iommu->seq_id],
5963 PCI_DEVID(info->bus, info->devfn),
5964 DMA_CCMD_MASK_NOBIT,
5965 DMA_CCMD_DEVICE_INVL);
5966 }
5967
5968 /* Enable PASID support in the device, if it wasn't already */
5969 if (!info->pasid_enabled)
5970 iommu_enable_dev_iotlb(info);
5971
5972 ret = 0;
5973
5974 out:
5975 spin_unlock(&iommu->lock);
5976 spin_unlock_irqrestore(&device_domain_lock, flags);
5977
5978 return ret;
5979 }
5980
5981 static void intel_iommu_apply_resv_region(struct device *dev,
5982 struct iommu_domain *domain,
5983 struct iommu_resv_region *region)
5984 {
5985 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5986 unsigned long start, end;
5987
5988 start = IOVA_PFN(region->start);
5989 end = IOVA_PFN(region->start + region->length - 1);
5990
5991 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5992 }
5993
5994 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5995 {
5996 if (dev_is_pci(dev))
5997 return pci_device_group(dev);
5998 return generic_device_group(dev);
5999 }
6000
6001 #ifdef CONFIG_INTEL_IOMMU_SVM
6002 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
6003 {
6004 struct intel_iommu *iommu;
6005 u8 bus, devfn;
6006
6007 if (iommu_dummy(dev)) {
6008 dev_warn(dev,
6009 "No IOMMU translation for device; cannot enable SVM\n");
6010 return NULL;
6011 }
6012
6013 iommu = device_to_iommu(dev, &bus, &devfn);
6014 if (!iommu) {

6015 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
6016 return NULL;
6017 }
6018
6019 return iommu;
6020 }
6021 #endif /* CONFIG_INTEL_IOMMU_SVM */
6022
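/*
* Auxiliary domains are only available in scalable mode, where a device
* can be attached to extra domains isolated by PASID; hence PASID must be
* enabled on both the IOMMU and the device before auxd is switched on.
*/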
6023 static int intel_iommu_enable_auxd(struct device *dev)
6024 {
6025 struct device_domain_info *info;
6026 struct intel_iommu *iommu;
6027 unsigned long flags;
6028 u8 bus, devfn;
6029 int ret;
6030
6031 iommu = device_to_iommu(dev, &bus, &devfn);
6032 if (!iommu || dmar_disabled)
6033 return -EINVAL;
6034
6035 if (!sm_supported(iommu) || !pasid_supported(iommu))
6036 return -EINVAL;
6037
6038 ret = intel_iommu_enable_pasid(iommu, dev);
6039 if (ret)
6040 return -ENODEV;
6041
6042 spin_lock_irqsave(&device_domain_lock, flags);
6043 info = dev->archdata.iommu;
6044 info->auxd_enabled = 1;
6045 spin_unlock_irqrestore(&device_domain_lock, flags);
6046
6047 return 0;
6048 }
6049
6050 static int intel_iommu_disable_auxd(struct device *dev)
6051 {
6052 struct device_domain_info *info;
6053 unsigned long flags;
6054
6055 spin_lock_irqsave(&device_domain_lock, flags);
6056 info = dev->archdata.iommu;
6057 if (!WARN_ON(!info))
6058 info->auxd_enabled = 0;
6059 spin_unlock_irqrestore(&device_domain_lock, flags);
6060
6061 return 0;
6062 }
6063
6064 /*
6065 * A PCI Express Designated Vendor-Specific Extended Capability is defined
6066 * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
6067 * so that system software and tools can detect endpoint devices that
6068 * support Intel Scalable I/O Virtualization without a host driver dependency.
6069 *
6070 * Returns the configuration space offset of the matching extended
6071 * capability structure, or 0 if the device does not support Intel
6072 * Scalable I/O Virtualization.
6073 */
6074 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6075 {
6076 int pos;
6077 u16 vendor, id;
6078
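/*
* Walk the DVSEC extended capabilities (capability ID 0x23): DVSEC
* header 1 at offset 4 carries the vendor ID and DVSEC header 2 at
* offset 8 carries the DVSEC ID; the SIOV spec assigns DVSEC ID 5.
*/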
6079 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6080 while (pos) {
6081 pci_read_config_word(pdev, pos + 4, &vendor);
6082 pci_read_config_word(pdev, pos + 8, &id);
6083 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6084 return pos;
6085
6086 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
6087 }
6088
6089 return 0;
6090 }
6091
6092 static bool
6093 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6094 {
6095 if (feat == IOMMU_DEV_FEAT_AUX) {
6096 int ret;
6097
6098 if (!dev_is_pci(dev) || dmar_disabled ||
6099 !scalable_mode_support() || !iommu_pasid_support())
6100 return false;
6101
6102 ret = pci_pasid_features(to_pci_dev(dev));
6103 if (ret < 0)
6104 return false;
6105
6106 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6107 }
6108
6109 return false;
6110 }
6111
6112 static int
6113 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6114 {
6115 if (feat == IOMMU_DEV_FEAT_AUX)
6116 return intel_iommu_enable_auxd(dev);
6117
6118 return -ENODEV;
6119 }
6120
6121 static int
6122 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6123 {
6124 if (feat == IOMMU_DEV_FEAT_AUX)
6125 return intel_iommu_disable_auxd(dev);
6126
6127 return -ENODEV;
6128 }
6129
6130 static bool
6131 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6132 {
6133 struct device_domain_info *info = dev->archdata.iommu;
6134
6135 if (feat == IOMMU_DEV_FEAT_AUX)
6136 return scalable_mode_support() && info && info->auxd_enabled;
6137
6138 return false;
6139 }
6140
6141 static int
6142 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6143 {
6144 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6145
6146 return dmar_domain->default_pasid > 0 ?
6147 dmar_domain->default_pasid : -EINVAL;
6148 }
6149
6150 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6151 struct device *dev)
6152 {
6153 return attach_deferred(dev);
6154 }
6155
6156 static int
6157 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6158 enum iommu_attr attr, void *data)
6159 {
6160 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6161 unsigned long flags;
6162 int ret = 0;
6163
6164 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6165 return -EINVAL;
6166
6167 switch (attr) {
6168 case DOMAIN_ATTR_NESTING:
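/*
* Nesting can only be enabled while no devices are attached, since the
* flag decides how the domain is programmed at attach time.
*/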
6169 spin_lock_irqsave(&device_domain_lock, flags);
6170 if (nested_mode_support() &&
6171 list_empty(&dmar_domain->devices)) {
6172 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6173 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6174 } else {
6175 ret = -ENODEV;
6176 }
6177 spin_unlock_irqrestore(&device_domain_lock, flags);
6178 break;
6179 default:
6180 ret = -EINVAL;
6181 break;
6182 }
6183
6184 return ret;
6185 }
6186
6187 const struct iommu_ops intel_iommu_ops = {
6188 .capable = intel_iommu_capable,
6189 .domain_alloc = intel_iommu_domain_alloc,
6190 .domain_free = intel_iommu_domain_free,
6191 .domain_set_attr = intel_iommu_domain_set_attr,
6192 .attach_dev = intel_iommu_attach_device,
6193 .detach_dev = intel_iommu_detach_device,
6194 .aux_attach_dev = intel_iommu_aux_attach_device,
6195 .aux_detach_dev = intel_iommu_aux_detach_device,
6196 .aux_get_pasid = intel_iommu_aux_get_pasid,
6197 .map = intel_iommu_map,
6198 .unmap = intel_iommu_unmap,
6199 .iova_to_phys = intel_iommu_iova_to_phys,
6200 .add_device = intel_iommu_add_device,
6201 .remove_device = intel_iommu_remove_device,
6202 .get_resv_regions = intel_iommu_get_resv_regions,
6203 .put_resv_regions = generic_iommu_put_resv_regions,
6204 .apply_resv_region = intel_iommu_apply_resv_region,
6205 .device_group = intel_iommu_device_group,
6206 .dev_has_feat = intel_iommu_dev_has_feat,
6207 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6208 .dev_enable_feat = intel_iommu_dev_enable_feat,
6209 .dev_disable_feat = intel_iommu_dev_disable_feat,
6210 .is_attach_deferred = intel_iommu_is_attach_deferred,
6211 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6212 };
6213
6214 static void quirk_iommu_igfx(struct pci_dev *dev)
6215 {
6216 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6217 dmar_map_gfx = 0;
6218 }
6219
6220 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6226 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6228
6229 /* Broadwell igfx malfunctions with dmar */
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6236 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6237 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6242 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6254
6255 static void quirk_iommu_rwbf(struct pci_dev *dev)
6256 {
6257 /*
6258 * Mobile 4 Series Chipset neglects to set RWBF capability,
6259 * but needs it. Same seems to hold for the desktop versions.
6260 */
6261 pci_info(dev, "Forcing write-buffer flush capability\n");
6262 rwbf_quirk = 1;
6263 }
6264
6265 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6267 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6268 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6269 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6270 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6271 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6272
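/*
* GGC is the graphics control register in the host bridge config space
* (offset 0x52); the values below describe how much stolen memory the
* BIOS set aside for the graphics translation table, and whether any of
* it is reserved for VT-d (shadow GTT) use.
*/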
6273 #define GGC 0x52
6274 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6275 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6276 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6277 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6278 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6279 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6280 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6281 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6282
6283 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6284 {
6285 unsigned short ggc;
6286
6287 if (pci_read_config_word(dev, GGC, &ggc))
6288 return;
6289
6290 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6291 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6292 dmar_map_gfx = 0;
6293 } else if (dmar_map_gfx) {
6294 /* we have to ensure the gfx device is idle before we flush */
6295 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6296 intel_iommu_strict = 1;
6297 }
6298 }
6299 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6300 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6301 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6302 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6303
6304 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6305 * ISOCH DMAR unit for the Azalia sound device, but not give it any
6306 * TLB entries, which causes it to deadlock. Check for that. We do
6307 * this in a function called from init_dmars(), instead of in a PCI
6308 * quirk, because we don't want to print the obnoxious "BIOS broken"
6309 * message if VT-d is actually disabled.
6310 */
6311 static void __init check_tylersburg_isoch(void)
6312 {
6313 struct pci_dev *pdev;
6314 uint32_t vtisochctrl;
6315
6316 /* If there's no Azalia in the system anyway, forget it. */
6317 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6318 if (!pdev)
6319 return;
6320 pci_dev_put(pdev);
6321
6322 /* System Management Registers. Might be hidden, in which case
6323 * we can't do the sanity check. But that's OK, because the
6324 * known-broken BIOSes _don't_ actually hide it, so far. */
6325 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6326 if (!pdev)
6327 return;
6328
6329 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6330 pci_dev_put(pdev);
6331 return;
6332 }
6333
6334 pci_dev_put(pdev);
6335
6336 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6337 if (vtisochctrl & 1)
6338 return;
6339
6340 /* Drop all bits other than the number of TLB entries */
6341 vtisochctrl &= 0x1c;
6342
6343 /* If we have the recommended number of TLB entries (16), fine. */
6344 if (vtisochctrl == 0x10)
6345 return;
6346
6347 /* Zero TLB entries? Warn and fall back to identity mapping for Azalia. */
6348 if (!vtisochctrl) {
6349 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6350 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6351 dmi_get_system_info(DMI_BIOS_VENDOR),
6352 dmi_get_system_info(DMI_BIOS_VERSION),
6353 dmi_get_system_info(DMI_PRODUCT_VERSION));
6354 iommu_identity_mapping |= IDENTMAP_AZALIA;
6355 return;
6356 }
6357
6358 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6359 vtisochctrl);
6360 }