drivers/iommu/intel-iommu.c

   1 /*
   2  * Copyright © 2006-2014 Intel Corporation.
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms and conditions of the GNU General Public License,
   6  * version 2, as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope it will be useful, but WITHOUT
   9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11  * more details.
  12  *
  13  * Authors: David Woodhouse <dwmw2@infradead.org>,
  14  *          Ashok Raj <ashok.raj@intel.com>,
  15  *          Shaohua Li <shaohua.li@intel.com>,
  16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
  17  *          Fenghua Yu <fenghua.yu@intel.com>
  18  *          Joerg Roedel <jroedel@suse.de>
  19  */
  20
  21 #define pr_fmt(fmt)     "DMAR: " fmt
  22
  23 #include <linux/init.h>
  24 #include <linux/bitmap.h>
  25 #include <linux/debugfs.h>
  26 #include <linux/export.h>
  27 #include <linux/slab.h>
  28 #include <linux/irq.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/spinlock.h>
  31 #include <linux/pci.h>
  32 #include <linux/dmar.h>
  33 #include <linux/dma-mapping.h>
  34 #include <linux/mempool.h>
  35 #include <linux/memory.h>
  36 #include <linux/cpu.h>
  37 #include <linux/timer.h>
  38 #include <linux/io.h>
  39 #include <linux/iova.h>
  40 #include <linux/iommu.h>
  41 #include <linux/intel-iommu.h>
  42 #include <linux/syscore_ops.h>
  43 #include <linux/tboot.h>
  44 #include <linux/dmi.h>
  45 #include <linux/pci-ats.h>
  46 #include <linux/memblock.h>
  47 #include <linux/dma-contiguous.h>
  48 #include <linux/crash_dump.h>
  49 #include <asm/irq_remapping.h>
  50 #include <asm/cacheflush.h>
  51 #include <asm/iommu.h>
  52
  53 #include "irq_remapping.h"
  54
  55 #define ROOT_SIZE               VTD_PAGE_SIZE
  56 #define CONTEXT_SIZE            VTD_PAGE_SIZE
  57
  58 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  59 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  60 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  61 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  62
  63 #define IOAPIC_RANGE_START      (0xfee00000)
  64 #define IOAPIC_RANGE_END        (0xfeefffff)
  65 #define IOVA_START_ADDR         (0x1000)
  66
  67 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  68
  69 #define MAX_AGAW_WIDTH 64
  70 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  71
  72 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  73 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  74
  75 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  76    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  77 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  78                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  79 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  80
  81 /* IO virtual address start page frame number */
  82 #define IOVA_START_PFN          (1)
  83
  84 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  85 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  86 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  87
/*
 * page table handling
 *
 * Each page-table level indexes LEVEL_STRIDE bits of the PFN; LEVEL_MASK
 * selects one such index (see pfn_level_offset()).
 */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB (every bit from bit 12
 * upwards is set).
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
 109
 110 static inline int agaw_to_level(int agaw)
 111 {
 112         return agaw + 2;
 113 }
 114
 115 static inline int agaw_to_width(int agaw)
 116 {
 117         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 118 }
 119
 120 static inline int width_to_agaw(int width)
 121 {
 122         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 123 }
 124
 125 static inline unsigned int level_to_offset_bits(int level)
 126 {
 127         return (level - 1) * LEVEL_STRIDE;
 128 }
 129
 130 static inline int pfn_level_offset(unsigned long pfn, int level)
 131 {
 132         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 133 }
 134
 135 static inline unsigned long level_mask(int level)
 136 {
 137         return -1UL << level_to_offset_bits(level);
 138 }
 139
 140 static inline unsigned long level_size(int level)
 141 {
 142         return 1UL << level_to_offset_bits(level);
 143 }
 144
 145 static inline unsigned long align_to_level(unsigned long pfn, int level)
 146 {
 147         return (pfn + level_size(level) - 1) & level_mask(level);
 148 }
 149
 150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 151 {
 152         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 153 }
 154
 155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 156    are never going to work. */
 157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 158 {
 159         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 160 }
 161
 162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 163 {
 164         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 165 }
 166 static inline unsigned long page_to_dma_pfn(struct page *pg)
 167 {
 168         return mm_to_dma_pfn(page_to_pfn(pg));
 169 }
 170 static inline unsigned long virt_to_dma_pfn(void *p)
 171 {
 172         return page_to_dma_pfn(virt_to_page(p));
 173 }
 174
 175 /* global iommu list, set NULL for ignored DMAR units */
 176 static struct intel_iommu **g_iommus;
 177
 178 static void __init check_tylersburg_isoch(void);
 179 static int rwbf_quirk;
 180
 181 /*
 182  * set to 1 to panic kernel if can't successfully enable VT-d
 183  * (used when kernel is launched w/ TXT)
 184  */
 185 static int force_on = 0;
 186
/*
 * Root-table entry (128 bits).
 *
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 *
 * When ecs_enabled() is true, iommu_context_addr() also uses 'hi' (same
 * present-bit/pointer layout as 'lo') for devfn values >= 0x80.
 */
struct root_entry {
	u64	lo;
	u64	hi;
};
/* Number of root entries that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 198
 199 /*
 200  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 201  * if marked present.
 202  */
 203 static phys_addr_t root_entry_lctp(struct root_entry *re)
 204 {
 205         if (!(re->lo & 1))
 206                 return 0;
 207
 208         return re->lo & VTD_PAGE_MASK;
 209 }
 210
 211 /*
 212  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 213  * if marked present.
 214  */
 215 static phys_addr_t root_entry_uctp(struct root_entry *re)
 216 {
 217         if (!(re->hi & 1))
 218                 return 0;
 219
 220         return re->hi & VTD_PAGE_MASK;
 221 }
/*
 * Context-table entry (128 bits).
 *
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 11: PASID enable (see context_pasid_enabled())
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval (bit 3 is used as the "copied" marker, see context_copied())
 * 8-23: domain id
 */
struct context_entry {
	u64 lo;
	u64 hi;
};
 237
 238 static inline void context_clear_pasid_enable(struct context_entry *context)
 239 {
 240         context->lo &= ~(1ULL << 11);
 241 }
 242
 243 static inline bool context_pasid_enabled(struct context_entry *context)
 244 {
 245         return !!(context->lo & (1ULL << 11));
 246 }
 247
 248 static inline void context_set_copied(struct context_entry *context)
 249 {
 250         context->hi |= (1ull << 3);
 251 }
 252
 253 static inline bool context_copied(struct context_entry *context)
 254 {
 255         return !!(context->hi & (1ULL << 3));
 256 }
 257
 258 static inline bool __context_present(struct context_entry *context)
 259 {
 260         return (context->lo & 1);
 261 }
 262
 263 static inline bool context_present(struct context_entry *context)
 264 {
 265         return context_pasid_enabled(context) ?
 266              __context_present(context) :
 267              __context_present(context) && !context_copied(context);
 268 }
 269
 270 static inline void context_set_present(struct context_entry *context)
 271 {
 272         context->lo |= 1;
 273 }
 274
 275 static inline void context_set_fault_enable(struct context_entry *context)
 276 {
 277         context->lo &= (((u64)-1) << 2) | 1;
 278 }
 279
 280 static inline void context_set_translation_type(struct context_entry *context,
 281                                                 unsigned long value)
 282 {
 283         context->lo &= (((u64)-1) << 4) | 3;
 284         context->lo |= (value & 3) << 2;
 285 }
 286
 287 static inline void context_set_address_root(struct context_entry *context,
 288                                             unsigned long value)
 289 {
 290         context->lo &= ~VTD_PAGE_MASK;
 291         context->lo |= value & VTD_PAGE_MASK;
 292 }
 293
 294 static inline void context_set_address_width(struct context_entry *context,
 295                                              unsigned long value)
 296 {
 297         context->hi |= value & 7;
 298 }
 299
 300 static inline void context_set_domain_id(struct context_entry *context,
 301                                          unsigned long value)
 302 {
 303         context->hi |= (value & ((1 << 16) - 1)) << 8;
 304 }
 305
 306 static inline int context_domain_id(struct context_entry *c)
 307 {
 308         return((c->hi >> 8) & 0xffff);
 309 }
 310
 311 static inline void context_clear_entry(struct context_entry *context)
 312 {
 313         context->lo = 0;
 314         context->hi = 0;
 315 }
 316
/*
 * Page-table entry (64 bits).
 *
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};
 329
 330 static inline void dma_clear_pte(struct dma_pte *pte)
 331 {
 332         pte->val = 0;
 333 }
 334
 335 static inline u64 dma_pte_addr(struct dma_pte *pte)
 336 {
 337 #ifdef CONFIG_64BIT
 338         return pte->val & VTD_PAGE_MASK;
 339 #else
 340         /* Must have a full atomic 64-bit read */
 341         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 342 #endif
 343 }
 344
 345 static inline bool dma_pte_present(struct dma_pte *pte)
 346 {
 347         return (pte->val & 3) != 0;
 348 }
 349
 350 static inline bool dma_pte_superpage(struct dma_pte *pte)
 351 {
 352         return (pte->val & DMA_PTE_LARGE_PAGE);
 353 }
 354
 355 static inline int first_pte_in_page(struct dma_pte *pte)
 356 {
 357         return !((unsigned long)pte & ~VTD_PAGE_MASK);
 358 }
 359
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/*
 * Domain represents a virtual machine; more than one device
 * across iommus may be owned by one domain, e.g. a kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 0)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 1)
 377
 378 #define for_each_domain_iommu(idx, domain)                      \
 379         for (idx = 0; idx < g_num_of_iommus; idx++)             \
 380                 if (domain->iommu_refcnt[idx])
 381
/* Driver-private representation of one DMA remapping domain. */
struct dmar_domain {
	int	nid;			/* node id */

	unsigned	iommu_refcnt[DMAR_UNITS_SUPPORTED];
					/* Refcount of devices per iommu */


	u16		iommu_did[DMAR_UNITS_SUPPORTED];
					/* Domain ids per IOMMU. Use u16 since
					 * domain ids are 16 bit wide according
					 * to VT-d spec, section 9.3 */

	bool has_iotlb_device;
	struct list_head devices;	/* all devices' list */
	struct iova_domain iovad;	/* iova's that belong to this domain */

	struct dma_pte	*pgd;		/* virtual address */
	int		gaw;		/* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;

	int		flags;		/* flags to find out type of domain
					   (DOMAIN_FLAG_*) */

	int		iommu_coherency;/* indicate coherency of iommu access */
	int		iommu_snooping; /* indicate snooping control feature*/
	int		iommu_count;	/* reference count of iommu */
	int		iommu_superpage;/* Level of superpages supported:
					   0 == 4KiB (no superpages), 1 == 2MiB,
					   2 == 1GiB, 3 == 512GiB, 4 == 256TiB
					   (each level adds LEVEL_STRIDE == 9
					   address bits) */
	u64		max_addr;	/* maximum mapped address */

	struct iommu_domain domain;	/* generic domain data structure for
					   iommu core; to_dmar_domain() maps
					   it back to this structure */
};
 417
/*
 * PCI domain-device relationship: one record per device attached to a
 * domain, carrying the device's bus/devfn and feature bits.
 */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	struct list_head global; /* link to global list */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	u8 pasid_supported:3;
	u8 pasid_enabled:1;
	u8 pri_supported:1;
	u8 pri_enabled:1;
	u8 ats_supported:1;
	u8 ats_enabled:1;
	u8 ats_qdep;
	struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain; /* pointer to domain */
};
 435
/* One RMRR unit; instances are linked on dmar_rmrr_units. */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
	struct iommu_resv_region *resv; /* reserved region handle */
};
 445
/* One ATSR unit; instances are linked on dmar_atsr_units. */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
 453
/* Global lists of the ATSR and RMRR units (linked through their 'list'). */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

/* Iterate 'rmrr' over every struct dmar_rmrr_unit on dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

static void flush_unmaps_timeout(unsigned long data);
 461
/* One queued unmap: an IOVA range in a domain plus an associated page list. */
struct deferred_flush_entry {
	unsigned long iova_pfn;
	unsigned long nrpages;
	struct dmar_domain *domain;
	struct page *freelist;
};

/* Capacity of one deferred_flush_table. */
#define HIGH_WATER_MARK 250
struct deferred_flush_table {
	int next;	/* NOTE(review): presumably the next free slot in entries[] - confirm */
	struct deferred_flush_entry entries[HIGH_WATER_MARK];
};

/* Deferred-flush state; one instance exists per CPU (see below). */
struct deferred_flush_data {
	spinlock_t lock;
	int timer_on;
	struct timer_list timer;
	long size;
	struct deferred_flush_table *tables;
};

DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
 484
/*
 * Upper bound for IOMMU indices: used as the loop limit by
 * for_each_domain_iommu() and as the range check in domain_get_iommu().
 */
static int g_num_of_iommus;

/* Forward declarations for helpers defined later in this file. */
static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct dmar_domain *domain,
				     struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static void domain_context_clear(struct intel_iommu *iommu,
				 struct device *dev);
static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu);
/*
 * Default enable state, chosen at build time; the "on"/"off" options of
 * intel_iommu= override it (see intel_iommu_setup()).
 */
#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

/* Tunables; most are changed by intel_iommu= options in intel_iommu_setup(). */
static int dmar_map_gfx = 1;		/* cleared by "igfx_off" */
static int dmar_forcedac;		/* set by "forcedac" */
static int intel_iommu_strict;		/* set by "strict" */
static int intel_iommu_superpage = 1;	/* cleared by "sp_off" */
static int intel_iommu_ecs = 1;		/* cleared by "ecs_off" */
static int intel_iommu_pasid28;		/* set by "pasid28" */
static int iommu_identity_mapping;	/* bitmask of IDENTMAP_* values */

/* Bit flags stored in iommu_identity_mapping. */
#define IDENTMAP_ALL		1
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4
 518
 519 /* Broadwell and Skylake have broken ECS support — normal so-called "second
 520  * level" translation of DMA requests-without-PASID doesn't actually happen
 521  * unless you also set the NESTE bit in an extended context-entry. Which of
 522  * course means that SVM doesn't work because it's trying to do nested
 523  * translation of the physical addresses it finds in the process page tables,
 524  * through the IOVA->phys mapping found in the "second level" page tables.
 525  *
 526  * The VT-d specification was retroactively changed to change the definition
 527  * of the capability bits and pretend that Broadwell/Skylake never happened...
 528  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 529  * for some reason it was the PASID capability bit which was redefined (from
 530  * bit 28 on BDW/SKL to bit 40 in future).
 531  *
 532  * So our test for ECS needs to eschew those implementations which set the old
 533  * PASID capabiity bit 28, since those are the ones on which ECS is broken.
 534  * Unless we are working around the 'pasid28' limitations, that is, by putting
 535  * the device into passthrough mode for normal DMA and thus masking the bug.
 536  */
 537 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
 538                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
 539 /* PASID support is thus enabled if ECS is enabled and *either* of the old
 540  * or new capability bits are set. */
 541 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
 542                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
 543
int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

/*
 * Sentinel stored in dev->archdata.iommu; iommu_dummy() tests for it and
 * device_to_iommu() returns NULL for such devices.
 */
#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
/* NOTE(review): presumably device_domain_lock guards device_domain_list - confirm */
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

/* Tentative definition; NOTE(review): the initialized definition is expected later in the file - confirm */
const struct iommu_ops intel_iommu_ops;
 552
 553 static bool translation_pre_enabled(struct intel_iommu *iommu)
 554 {
 555         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 556 }
 557
 558 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 559 {
 560         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 561 }
 562
 563 static void init_translation_status(struct intel_iommu *iommu)
 564 {
 565         u32 gsts;
 566
 567         gsts = readl(iommu->reg + DMAR_GSTS_REG);
 568         if (gsts & DMA_GSTS_TES)
 569                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 570 }
 571
 572 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
 573 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 574 {
 575         return container_of(dom, struct dmar_domain, domain);
 576 }
 577
 578 static int __init intel_iommu_setup(char *str)
 579 {
 580         if (!str)
 581                 return -EINVAL;
 582         while (*str) {
 583                 if (!strncmp(str, "on", 2)) {
 584                         dmar_disabled = 0;
 585                         pr_info("IOMMU enabled\n");
 586                 } else if (!strncmp(str, "off", 3)) {
 587                         dmar_disabled = 1;
 588                         pr_info("IOMMU disabled\n");
 589                 } else if (!strncmp(str, "igfx_off", 8)) {
 590                         dmar_map_gfx = 0;
 591                         pr_info("Disable GFX device mapping\n");
 592                 } else if (!strncmp(str, "forcedac", 8)) {
 593                         pr_info("Forcing DAC for PCI devices\n");
 594                         dmar_forcedac = 1;
 595                 } else if (!strncmp(str, "strict", 6)) {
 596                         pr_info("Disable batched IOTLB flush\n");
 597                         intel_iommu_strict = 1;
 598                 } else if (!strncmp(str, "sp_off", 6)) {
 599                         pr_info("Disable supported super page\n");
 600                         intel_iommu_superpage = 0;
 601                 } else if (!strncmp(str, "ecs_off", 7)) {
 602                         printk(KERN_INFO
 603                                 "Intel-IOMMU: disable extended context table support\n");
 604                         intel_iommu_ecs = 0;
 605                 } else if (!strncmp(str, "pasid28", 7)) {
 606                         printk(KERN_INFO
 607                                 "Intel-IOMMU: enable pre-production PASID support\n");
 608                         intel_iommu_pasid28 = 1;
 609                         iommu_identity_mapping |= IDENTMAP_GFX;
 610                 }
 611
 612                 str += strcspn(str, ",");
 613                 while (*str == ',')
 614                         str++;
 615         }
 616         return 0;
 617 }
 618 __setup("intel_iommu=", intel_iommu_setup);
 619
/* Slab caches used by alloc_domain_mem() and alloc_devinfo_mem() respectively. */
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
 622
 623 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 624 {
 625         struct dmar_domain **domains;
 626         int idx = did >> 8;
 627
 628         domains = iommu->domains[idx];
 629         if (!domains)
 630                 return NULL;
 631
 632         return domains[did & 0xff];
 633 }
 634
 635 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 636                              struct dmar_domain *domain)
 637 {
 638         struct dmar_domain **domains;
 639         int idx = did >> 8;
 640
 641         if (!iommu->domains[idx]) {
 642                 size_t size = 256 * sizeof(struct dmar_domain *);
 643                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 644         }
 645
 646         domains = iommu->domains[idx];
 647         if (WARN_ON(!domains))
 648                 return;
 649         else
 650                 domains[did & 0xff] = domain;
 651 }
 652
 653 static inline void *alloc_pgtable_page(int node)
 654 {
 655         struct page *page;
 656         void *vaddr = NULL;
 657
 658         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 659         if (page)
 660                 vaddr = page_address(page);
 661         return vaddr;
 662 }
 663
 664 static inline void free_pgtable_page(void *vaddr)
 665 {
 666         free_page((unsigned long)vaddr);
 667 }
 668
 669 static inline void *alloc_domain_mem(void)
 670 {
 671         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 672 }
 673
 674 static void free_domain_mem(void *vaddr)
 675 {
 676         kmem_cache_free(iommu_domain_cache, vaddr);
 677 }
 678
 679 static inline void * alloc_devinfo_mem(void)
 680 {
 681         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 682 }
 683
 684 static inline void free_devinfo_mem(void *vaddr)
 685 {
 686         kmem_cache_free(iommu_devinfo_cache, vaddr);
 687 }
 688
 689 static inline int domain_type_is_vm(struct dmar_domain *domain)
 690 {
 691         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
 692 }
 693
 694 static inline int domain_type_is_si(struct dmar_domain *domain)
 695 {
 696         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 697 }
 698
 699 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
 700 {
 701         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
 702                                 DOMAIN_FLAG_STATIC_IDENTITY);
 703 }
 704
 705 static inline int domain_pfn_supported(struct dmar_domain *domain,
 706                                        unsigned long pfn)
 707 {
 708         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 709
 710         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 711 }
 712
 713 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 714 {
 715         unsigned long sagaw;
 716         int agaw = -1;
 717
 718         sagaw = cap_sagaw(iommu->cap);
 719         for (agaw = width_to_agaw(max_gaw);
 720              agaw >= 0; agaw--) {
 721                 if (test_bit(agaw, &sagaw))
 722                         break;
 723         }
 724
 725         return agaw;
 726 }
 727
 728 /*
 729  * Calculate max SAGAW for each iommu.
 730  */
 731 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 732 {
 733         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 734 }
 735
 736 /*
 737  * calculate agaw for each iommu.
 738  * "SAGAW" may be different across iommus, use a default agaw, and
 739  * get a supported less agaw for iommus that don't support the default agaw.
 740  */
 741 int iommu_calculate_agaw(struct intel_iommu *iommu)
 742 {
 743         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 744 }
 745
 746 /* This functionin only returns single iommu in a domain */
 747 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 748 {
 749         int iommu_id;
 750
 751         /* si_domain and vm domain should not get here. */
 752         BUG_ON(domain_type_is_vm_or_si(domain));
 753         for_each_domain_iommu(iommu_id, domain)
 754                 break;
 755
 756         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 757                 return NULL;
 758
 759         return g_iommus[iommu_id];
 760 }
 761
 762 static void domain_update_iommu_coherency(struct dmar_domain *domain)
 763 {
 764         struct dmar_drhd_unit *drhd;
 765         struct intel_iommu *iommu;
 766         bool found = false;
 767         int i;
 768
 769         domain->iommu_coherency = 1;
 770
 771         for_each_domain_iommu(i, domain) {
 772                 found = true;
 773                 if (!ecap_coherent(g_iommus[i]->ecap)) {
 774                         domain->iommu_coherency = 0;
 775                         break;
 776                 }
 777         }
 778         if (found)
 779                 return;
 780
 781         /* No hardware attached; use lowest common denominator */
 782         rcu_read_lock();
 783         for_each_active_iommu(iommu, drhd) {
 784                 if (!ecap_coherent(iommu->ecap)) {
 785                         domain->iommu_coherency = 0;
 786                         break;
 787                 }
 788         }
 789         rcu_read_unlock();
 790 }
 791
 792 static int domain_update_iommu_snooping(struct intel_iommu *skip)
 793 {
 794         struct dmar_drhd_unit *drhd;
 795         struct intel_iommu *iommu;
 796         int ret = 1;
 797
 798         rcu_read_lock();
 799         for_each_active_iommu(iommu, drhd) {
 800                 if (iommu != skip) {
 801                         if (!ecap_sc_support(iommu->ecap)) {
 802                                 ret = 0;
 803                                 break;
 804                         }
 805                 }
 806         }
 807         rcu_read_unlock();
 808
 809         return ret;
 810 }
 811
 812 static int domain_update_iommu_superpage(struct intel_iommu *skip)
 813 {
 814         struct dmar_drhd_unit *drhd;
 815         struct intel_iommu *iommu;
 816         int mask = 0xf;
 817
 818         if (!intel_iommu_superpage) {
 819                 return 0;
 820         }
 821
 822         /* set iommu_superpage to the smallest common denominator */
 823         rcu_read_lock();
 824         for_each_active_iommu(iommu, drhd) {
 825                 if (iommu != skip) {
 826                         mask &= cap_super_page_val(iommu->cap);
 827                         if (!mask)
 828                                 break;
 829                 }
 830         }
 831         rcu_read_unlock();
 832
 833         return fls(mask);
 834 }
 835
 836 /* Some capabilities may be different across iommus */
 837 static void domain_update_iommu_cap(struct dmar_domain *domain)
 838 {
 839         domain_update_iommu_coherency(domain);
 840         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 841         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
 842 }
 843
 844 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
 845                                                        u8 bus, u8 devfn, int alloc)
 846 {
 847         struct root_entry *root = &iommu->root_entry[bus];
 848         struct context_entry *context;
 849         u64 *entry;
 850
 851         entry = &root->lo;
 852         if (ecs_enabled(iommu)) {
 853                 if (devfn >= 0x80) {
 854                         devfn -= 0x80;
 855                         entry = &root->hi;
 856                 }
 857                 devfn *= 2;
 858         }
 859         if (*entry & 1)
 860                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
 861         else {
 862                 unsigned long phy_addr;
 863                 if (!alloc)
 864                         return NULL;
 865
 866                 context = alloc_pgtable_page(iommu->node);
 867                 if (!context)
 868                         return NULL;
 869
 870                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 871                 phy_addr = virt_to_phys((void *)context);
 872                 *entry = phy_addr | 1;
 873                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
 874         }
 875         return &context[devfn];
 876 }
 877
 878 static int iommu_dummy(struct device *dev)
 879 {
 880         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
 881 }
 882
 883 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 884 {
 885         struct dmar_drhd_unit *drhd = NULL;
 886         struct intel_iommu *iommu;
 887         struct device *tmp;
 888         struct pci_dev *ptmp, *pdev = NULL;
 889         u16 segment = 0;
 890         int i;
 891
 892         if (iommu_dummy(dev))
 893                 return NULL;
 894
 895         if (dev_is_pci(dev)) {
 896                 struct pci_dev *pf_pdev;
 897
 898                 pdev = to_pci_dev(dev);
 899                 /* VFs aren't listed in scope tables; we need to look up
 900                  * the PF instead to find the IOMMU. */
 901                 pf_pdev = pci_physfn(pdev);
 902                 dev = &pf_pdev->dev;
 903                 segment = pci_domain_nr(pdev->bus);
 904         } else if (has_acpi_companion(dev))
 905                 dev = &ACPI_COMPANION(dev)->dev;
 906
 907         rcu_read_lock();
 908         for_each_active_iommu(iommu, drhd) {
 909                 if (pdev && segment != drhd->segment)
 910                         continue;
 911
 912                 for_each_active_dev_scope(drhd->devices,
 913                                           drhd->devices_cnt, i, tmp) {
 914                         if (tmp == dev) {
 915                                 /* For a VF use its original BDF# not that of the PF
 916                                  * which we used for the IOMMU lookup. Strictly speaking
 917                                  * we could do this for all PCI devices; we only need to
 918                                  * get the BDF# from the scope table for ACPI matches. */
 919                                 if (pdev && pdev->is_virtfn)
 920                                         goto got_pdev;
 921
 922                                 *bus = drhd->devices[i].bus;
 923                                 *devfn = drhd->devices[i].devfn;
 924                                 goto out;
 925                         }
 926
 927                         if (!pdev || !dev_is_pci(tmp))
 928                                 continue;
 929
 930                         ptmp = to_pci_dev(tmp);
 931                         if (ptmp->subordinate &&
 932                             ptmp->subordinate->number <= pdev->bus->number &&
 933                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
 934                                 goto got_pdev;
 935                 }
 936
 937                 if (pdev && drhd->include_all) {
 938                 got_pdev:
 939                         *bus = pdev->bus->number;
 940                         *devfn = pdev->devfn;
 941                         goto out;
 942                 }
 943         }
 944         iommu = NULL;
 945  out:
 946         rcu_read_unlock();
 947
 948         return iommu;
 949 }
 950
 951 static void domain_flush_cache(struct dmar_domain *domain,
 952                                void *addr, int size)
 953 {
 954         if (!domain->iommu_coherency)
 955                 clflush_cache_range(addr, size);
 956 }
 957
 958 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 959 {
 960         struct context_entry *context;
 961         int ret = 0;
 962         unsigned long flags;
 963
 964         spin_lock_irqsave(&iommu->lock, flags);
 965         context = iommu_context_addr(iommu, bus, devfn, 0);
 966         if (context)
 967                 ret = context_present(context);
 968         spin_unlock_irqrestore(&iommu->lock, flags);
 969         return ret;
 970 }
 971
 972 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 973 {
 974         struct context_entry *context;
 975         unsigned long flags;
 976
 977         spin_lock_irqsave(&iommu->lock, flags);
 978         context = iommu_context_addr(iommu, bus, devfn, 0);
 979         if (context) {
 980                 context_clear_entry(context);
 981                 __iommu_flush_cache(iommu, context, sizeof(*context));
 982         }
 983         spin_unlock_irqrestore(&iommu->lock, flags);
 984 }
 985
 986 static void free_context_table(struct intel_iommu *iommu)
 987 {
 988         int i;
 989         unsigned long flags;
 990         struct context_entry *context;
 991
 992         spin_lock_irqsave(&iommu->lock, flags);
 993         if (!iommu->root_entry) {
 994                 goto out;
 995         }
 996         for (i = 0; i < ROOT_ENTRY_NR; i++) {
 997                 context = iommu_context_addr(iommu, i, 0, 0);
 998                 if (context)
 999                         free_pgtable_page(context);
1000
1001                 if (!ecs_enabled(iommu))
1002                         continue;
1003
1004                 context = iommu_context_addr(iommu, i, 0x80, 0);
1005                 if (context)
1006                         free_pgtable_page(context);
1007
1008         }
1009         free_pgtable_page(iommu->root_entry);
1010         iommu->root_entry = NULL;
1011 out:
1012         spin_unlock_irqrestore(&iommu->lock, flags);
1013 }
1014
1015 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1016                                       unsigned long pfn, int *target_level)
1017 {
1018         struct dma_pte *parent, *pte = NULL;
1019         int level = agaw_to_level(domain->agaw);
1020         int offset;
1021
1022         BUG_ON(!domain->pgd);
1023
1024         if (!domain_pfn_supported(domain, pfn))
1025                 /* Address beyond IOMMU's addressing capabilities. */
1026                 return NULL;
1027
1028         parent = domain->pgd;
1029
1030         while (1) {
1031                 void *tmp_page;
1032
1033                 offset = pfn_level_offset(pfn, level);
1034                 pte = &parent[offset];
1035                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1036                         break;
1037                 if (level == *target_level)
1038                         break;
1039
1040                 if (!dma_pte_present(pte)) {
1041                         uint64_t pteval;
1042
1043                         tmp_page = alloc_pgtable_page(domain->nid);
1044
1045                         if (!tmp_page)
1046                                 return NULL;
1047
1048                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1049                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1050                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1051                                 /* Someone else set it while we were thinking; use theirs. */
1052                                 free_pgtable_page(tmp_page);
1053                         else
1054                                 domain_flush_cache(domain, pte, sizeof(*pte));
1055                 }
1056                 if (level == 1)
1057                         break;
1058
1059                 parent = phys_to_virt(dma_pte_addr(pte));
1060                 level--;
1061         }
1062
1063         if (!*target_level)
1064                 *target_level = level;
1065
1066         return pte;
1067 }
1068
1069
1070 /* return address's pte at specific level */
1071 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1072                                          unsigned long pfn,
1073                                          int level, int *large_page)
1074 {
1075         struct dma_pte *parent, *pte = NULL;
1076         int total = agaw_to_level(domain->agaw);
1077         int offset;
1078
1079         parent = domain->pgd;
1080         while (level <= total) {
1081                 offset = pfn_level_offset(pfn, total);
1082                 pte = &parent[offset];
1083                 if (level == total)
1084                         return pte;
1085
1086                 if (!dma_pte_present(pte)) {
1087                         *large_page = total;
1088                         break;
1089                 }
1090
1091                 if (dma_pte_superpage(pte)) {
1092                         *large_page = total;
1093                         return pte;
1094                 }
1095
1096                 parent = phys_to_virt(dma_pte_addr(pte));
1097                 total--;
1098         }
1099         return NULL;
1100 }
1101
1102 /* clear last level pte, a tlb flush should be followed */
1103 static void dma_pte_clear_range(struct dmar_domain *domain,
1104                                 unsigned long start_pfn,
1105                                 unsigned long last_pfn)
1106 {
1107         unsigned int large_page = 1;
1108         struct dma_pte *first_pte, *pte;
1109
1110         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1111         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1112         BUG_ON(start_pfn > last_pfn);
1113
1114         /* we don't need lock here; nobody else touches the iova range */
1115         do {
1116                 large_page = 1;
1117                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1118                 if (!pte) {
1119                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1120                         continue;
1121                 }
1122                 do {
1123                         dma_clear_pte(pte);
1124                         start_pfn += lvl_to_nr_pages(large_page);
1125                         pte++;
1126                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1127
1128                 domain_flush_cache(domain, first_pte,
1129                                    (void *)pte - (void *)first_pte);
1130
1131         } while (start_pfn && start_pfn <= last_pfn);
1132 }
1133
1134 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1135                                struct dma_pte *pte, unsigned long pfn,
1136                                unsigned long start_pfn, unsigned long last_pfn)
1137 {
1138         pfn = max(start_pfn, pfn);
1139         pte = &pte[pfn_level_offset(pfn, level)];
1140
1141         do {
1142                 unsigned long level_pfn;
1143                 struct dma_pte *level_pte;
1144
1145                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1146                         goto next;
1147
1148                 level_pfn = pfn & level_mask(level);
1149                 level_pte = phys_to_virt(dma_pte_addr(pte));
1150
1151                 if (level > 2)
1152                         dma_pte_free_level(domain, level - 1, level_pte,
1153                                            level_pfn, start_pfn, last_pfn);
1154
1155                 /* If range covers entire pagetable, free it */
1156                 if (!(start_pfn > level_pfn ||
1157                       last_pfn < level_pfn + level_size(level) - 1)) {
1158                         dma_clear_pte(pte);
1159                         domain_flush_cache(domain, pte, sizeof(*pte));
1160                         free_pgtable_page(level_pte);
1161                 }
1162 next:
1163                 pfn += level_size(level);
1164         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1165 }
1166
1167 /* clear last level (leaf) ptes and free page table pages. */
1168 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1169                                    unsigned long start_pfn,
1170                                    unsigned long last_pfn)
1171 {
1172         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1173         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1174         BUG_ON(start_pfn > last_pfn);
1175
1176         dma_pte_clear_range(domain, start_pfn, last_pfn);
1177
1178         /* We don't need lock here; nobody else touches the iova range */
1179         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1180                            domain->pgd, 0, start_pfn, last_pfn);
1181
1182         /* free pgd */
1183         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1184                 free_pgtable_page(domain->pgd);
1185                 domain->pgd = NULL;
1186         }
1187 }
1188
1189 /* When a page at a given level is being unlinked from its parent, we don't
1190    need to *modify* it at all. All we need to do is make a list of all the
1191    pages which can be freed just as soon as we've flushed the IOTLB and we
1192    know the hardware page-walk will no longer touch them.
1193    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1194    be freed. */
1195 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1196                                             int level, struct dma_pte *pte,
1197                                             struct page *freelist)
1198 {
1199         struct page *pg;
1200
1201         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1202         pg->freelist = freelist;
1203         freelist = pg;
1204
1205         if (level == 1)
1206                 return freelist;
1207
1208         pte = page_address(pg);
1209         do {
1210                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1211                         freelist = dma_pte_list_pagetables(domain, level - 1,
1212                                                            pte, freelist);
1213                 pte++;
1214         } while (!first_pte_in_page(pte));
1215
1216         return freelist;
1217 }
1218
1219 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1220                                         struct dma_pte *pte, unsigned long pfn,
1221                                         unsigned long start_pfn,
1222                                         unsigned long last_pfn,
1223                                         struct page *freelist)
1224 {
1225         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1226
1227         pfn = max(start_pfn, pfn);
1228         pte = &pte[pfn_level_offset(pfn, level)];
1229
1230         do {
1231                 unsigned long level_pfn;
1232
1233                 if (!dma_pte_present(pte))
1234                         goto next;
1235
1236                 level_pfn = pfn & level_mask(level);
1237
1238                 /* If range covers entire pagetable, free it */
1239                 if (start_pfn <= level_pfn &&
1240                     last_pfn >= level_pfn + level_size(level) - 1) {
1241                         /* These suborbinate page tables are going away entirely. Don't
1242                            bother to clear them; we're just going to *free* them. */
1243                         if (level > 1 && !dma_pte_superpage(pte))
1244                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1245
1246                         dma_clear_pte(pte);
1247                         if (!first_pte)
1248                                 first_pte = pte;
1249                         last_pte = pte;
1250                 } else if (level > 1) {
1251                         /* Recurse down into a level that isn't *entirely* obsolete */
1252                         freelist = dma_pte_clear_level(domain, level - 1,
1253                                                        phys_to_virt(dma_pte_addr(pte)),
1254                                                        level_pfn, start_pfn, last_pfn,
1255                                                        freelist);
1256                 }
1257 next:
1258                 pfn += level_size(level);
1259         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1260
1261         if (first_pte)
1262                 domain_flush_cache(domain, first_pte,
1263                                    (void *)++last_pte - (void *)first_pte);
1264
1265         return freelist;
1266 }
1267
1268 /* We can't just free the pages because the IOMMU may still be walking
1269    the page tables, and may have cached the intermediate levels. The
1270    pages can only be freed after the IOTLB flush has been done. */
1271 static struct page *domain_unmap(struct dmar_domain *domain,
1272                                  unsigned long start_pfn,
1273                                  unsigned long last_pfn)
1274 {
1275         struct page *freelist = NULL;
1276
1277         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1278         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1279         BUG_ON(start_pfn > last_pfn);
1280
1281         /* we don't need lock here; nobody else touches the iova range */
1282         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1283                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1284
1285         /* free pgd */
1286         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1287                 struct page *pgd_page = virt_to_page(domain->pgd);
1288                 pgd_page->freelist = freelist;
1289                 freelist = pgd_page;
1290
1291                 domain->pgd = NULL;
1292         }
1293
1294         return freelist;
1295 }
1296
1297 static void dma_free_pagelist(struct page *freelist)
1298 {
1299         struct page *pg;
1300
1301         while ((pg = freelist)) {
1302                 freelist = pg->freelist;
1303                 free_pgtable_page(page_address(pg));
1304         }
1305 }
1306
1307 /* iommu handling */
1308 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1309 {
1310         struct root_entry *root;
1311         unsigned long flags;
1312
1313         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1314         if (!root) {
1315                 pr_err("Allocating root entry for %s failed\n",
1316                         iommu->name);
1317                 return -ENOMEM;
1318         }
1319
1320         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1321
1322         spin_lock_irqsave(&iommu->lock, flags);
1323         iommu->root_entry = root;
1324         spin_unlock_irqrestore(&iommu->lock, flags);
1325
1326         return 0;
1327 }
1328
1329 static void iommu_set_root_entry(struct intel_iommu *iommu)
1330 {
1331         u64 addr;
1332         u32 sts;
1333         unsigned long flag;
1334
1335         addr = virt_to_phys(iommu->root_entry);
1336         if (ecs_enabled(iommu))
1337                 addr |= DMA_RTADDR_RTT;
1338
1339         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1341
1342         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1343
1344         /* Make sure hardware complete it */
1345         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1346                       readl, (sts & DMA_GSTS_RTPS), sts);
1347
1348         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1349 }
1350
1351 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1352 {
1353         u32 val;
1354         unsigned long flag;
1355
1356         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1357                 return;
1358
1359         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1360         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1361
1362         /* Make sure hardware complete it */
1363         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1364                       readl, (!(val & DMA_GSTS_WBFS)), val);
1365
1366         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1367 }
1368
1369 /* return value determine if we need a write buffer flush */
1370 static void __iommu_flush_context(struct intel_iommu *iommu,
1371                                   u16 did, u16 source_id, u8 function_mask,
1372                                   u64 type)
1373 {
1374         u64 val = 0;
1375         unsigned long flag;
1376
1377         switch (type) {
1378         case DMA_CCMD_GLOBAL_INVL:
1379                 val = DMA_CCMD_GLOBAL_INVL;
1380                 break;
1381         case DMA_CCMD_DOMAIN_INVL:
1382                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1383                 break;
1384         case DMA_CCMD_DEVICE_INVL:
1385                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1386                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1387                 break;
1388         default:
1389                 BUG();
1390         }
1391         val |= DMA_CCMD_ICC;
1392
1393         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1394         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1395
1396         /* Make sure hardware complete it */
1397         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1398                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1399
1400         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1401 }
1402
1403 /* return value determine if we need a write buffer flush */
1404 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1405                                 u64 addr, unsigned int size_order, u64 type)
1406 {
1407         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1408         u64 val = 0, val_iva = 0;
1409         unsigned long flag;
1410
1411         switch (type) {
1412         case DMA_TLB_GLOBAL_FLUSH:
1413                 /* global flush doesn't need set IVA_REG */
1414                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1415                 break;
1416         case DMA_TLB_DSI_FLUSH:
1417                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1418                 break;
1419         case DMA_TLB_PSI_FLUSH:
1420                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1421                 /* IH bit is passed in as part of address */
1422                 val_iva = size_order | addr;
1423                 break;
1424         default:
1425                 BUG();
1426         }
1427         /* Note: set drain read/write */
1428 #if 0
1429         /*
1430          * This is probably to be super secure.. Looks like we can
1431          * ignore it without any impact.
1432          */
1433         if (cap_read_drain(iommu->cap))
1434                 val |= DMA_TLB_READ_DRAIN;
1435 #endif
1436         if (cap_write_drain(iommu->cap))
1437                 val |= DMA_TLB_WRITE_DRAIN;
1438
1439         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1440         /* Note: Only uses first TLB reg currently */
1441         if (val_iva)
1442                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1443         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1444
1445         /* Make sure hardware complete it */
1446         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1447                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1448
1449         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1450
1451         /* check IOTLB invalidation granularity */
1452         if (DMA_TLB_IAIG(val) == 0)
1453                 pr_err("Flush IOTLB failed\n");
1454         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1455                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1456                         (unsigned long long)DMA_TLB_IIRG(type),
1457                         (unsigned long long)DMA_TLB_IAIG(val));
1458 }
1459
1460 static struct device_domain_info *
1461 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1462                          u8 bus, u8 devfn)
1463 {
1464         struct device_domain_info *info;
1465
1466         assert_spin_locked(&device_domain_lock);
1467
1468         if (!iommu->qi)
1469                 return NULL;
1470
1471         list_for_each_entry(info, &domain->devices, link)
1472                 if (info->iommu == iommu && info->bus == bus &&
1473                     info->devfn == devfn) {
1474                         if (info->ats_supported && info->dev)
1475                                 return info;
1476                         break;
1477                 }
1478
1479         return NULL;
1480 }
1481
1482 static void domain_update_iotlb(struct dmar_domain *domain)
1483 {
1484         struct device_domain_info *info;
1485         bool has_iotlb_device = false;
1486
1487         assert_spin_locked(&device_domain_lock);
1488
1489         list_for_each_entry(info, &domain->devices, link) {
1490                 struct pci_dev *pdev;
1491
1492                 if (!info->dev || !dev_is_pci(info->dev))
1493                         continue;
1494
1495                 pdev = to_pci_dev(info->dev);
1496                 if (pdev->ats_enabled) {
1497                         has_iotlb_device = true;
1498                         break;
1499                 }
1500         }
1501
1502         domain->has_iotlb_device = has_iotlb_device;
1503 }
1504
1505 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1506 {
1507         struct pci_dev *pdev;
1508
1509         assert_spin_locked(&device_domain_lock);
1510
1511         if (!info || !dev_is_pci(info->dev))
1512                 return;
1513
1514         pdev = to_pci_dev(info->dev);
1515
1516 #ifdef CONFIG_INTEL_IOMMU_SVM
1517         /* The PCIe spec, in its wisdom, declares that the behaviour of
1518            the device if you enable PASID support after ATS support is
1519            undefined. So always enable PASID support on devices which
1520            have it, even if we can't yet know if we're ever going to
1521            use it. */
1522         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1523                 info->pasid_enabled = 1;
1524
1525         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1526                 info->pri_enabled = 1;
1527 #endif
1528         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1529                 info->ats_enabled = 1;
1530                 domain_update_iotlb(info->domain);
1531                 info->ats_qdep = pci_ats_queue_depth(pdev);
1532         }
1533 }
1534
1535 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1536 {
1537         struct pci_dev *pdev;
1538
1539         assert_spin_locked(&device_domain_lock);
1540
1541         if (!dev_is_pci(info->dev))
1542                 return;
1543
1544         pdev = to_pci_dev(info->dev);
1545
1546         if (info->ats_enabled) {
1547                 pci_disable_ats(pdev);
1548                 info->ats_enabled = 0;
1549                 domain_update_iotlb(info->domain);
1550         }
1551 #ifdef CONFIG_INTEL_IOMMU_SVM
1552         if (info->pri_enabled) {
1553                 pci_disable_pri(pdev);
1554                 info->pri_enabled = 0;
1555         }
1556         if (info->pasid_enabled) {
1557                 pci_disable_pasid(pdev);
1558                 info->pasid_enabled = 0;
1559         }
1560 #endif
1561 }
1562
1563 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1564                                   u64 addr, unsigned mask)
1565 {
1566         u16 sid, qdep;
1567         unsigned long flags;
1568         struct device_domain_info *info;
1569
1570         if (!domain->has_iotlb_device)
1571                 return;
1572
1573         spin_lock_irqsave(&device_domain_lock, flags);
1574         list_for_each_entry(info, &domain->devices, link) {
1575                 if (!info->ats_enabled)
1576                         continue;
1577
1578                 sid = info->bus << 8 | info->devfn;
1579                 qdep = info->ats_qdep;
1580                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1581         }
1582         spin_unlock_irqrestore(&device_domain_lock, flags);
1583 }
1584
1585 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1586                                   struct dmar_domain *domain,
1587                                   unsigned long pfn, unsigned int pages,
1588                                   int ih, int map)
1589 {
1590         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1591         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1592         u16 did = domain->iommu_did[iommu->seq_id];
1593
1594         BUG_ON(pages == 0);
1595
1596         if (ih)
1597                 ih = 1 << 6;
1598         /*
1599          * Fallback to domain selective flush if no PSI support or the size is
1600          * too big.
1601          * PSI requires page size to be 2 ^ x, and the base address is naturally
1602          * aligned to the size
1603          */
1604         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1605                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1606                                                 DMA_TLB_DSI_FLUSH);
1607         else
1608                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1609                                                 DMA_TLB_PSI_FLUSH);
1610
1611         /*
1612          * In caching mode, changes of pages from non-present to present require
1613          * flush. However, device IOTLB doesn't need to be flushed in this case.
1614          */
1615         if (!cap_caching_mode(iommu->cap) || !map)
1616                 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1617                                       addr, mask);
1618 }
1619
1620 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1621 {
1622         u32 pmen;
1623         unsigned long flags;
1624
1625         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1626         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1627         pmen &= ~DMA_PMEN_EPM;
1628         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1629
1630         /* wait for the protected region status bit to clear */
1631         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1632                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1633
1634         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1635 }
1636
1637 static void iommu_enable_translation(struct intel_iommu *iommu)
1638 {
1639         u32 sts;
1640         unsigned long flags;
1641
1642         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1643         iommu->gcmd |= DMA_GCMD_TE;
1644         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1645
1646         /* Make sure hardware complete it */
1647         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1648                       readl, (sts & DMA_GSTS_TES), sts);
1649
1650         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1651 }
1652
1653 static void iommu_disable_translation(struct intel_iommu *iommu)
1654 {
1655         u32 sts;
1656         unsigned long flag;
1657
1658         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1659         iommu->gcmd &= ~DMA_GCMD_TE;
1660         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1661
1662         /* Make sure hardware complete it */
1663         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1664                       readl, (!(sts & DMA_GSTS_TES)), sts);
1665
1666         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1667 }
1668
1669
1670 static int iommu_init_domains(struct intel_iommu *iommu)
1671 {
1672         u32 ndomains, nlongs;
1673         size_t size;
1674
1675         ndomains = cap_ndoms(iommu->cap);
1676         pr_debug("%s: Number of Domains supported <%d>\n",
1677                  iommu->name, ndomains);
1678         nlongs = BITS_TO_LONGS(ndomains);
1679
1680         spin_lock_init(&iommu->lock);
1681
1682         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1683         if (!iommu->domain_ids) {
1684                 pr_err("%s: Allocating domain id array failed\n",
1685                        iommu->name);
1686                 return -ENOMEM;
1687         }
1688
1689         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1690         iommu->domains = kzalloc(size, GFP_KERNEL);
1691
1692         if (iommu->domains) {
1693                 size = 256 * sizeof(struct dmar_domain *);
1694                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1695         }
1696
1697         if (!iommu->domains || !iommu->domains[0]) {
1698                 pr_err("%s: Allocating domain array failed\n",
1699                        iommu->name);
1700                 kfree(iommu->domain_ids);
1701                 kfree(iommu->domains);
1702                 iommu->domain_ids = NULL;
1703                 iommu->domains    = NULL;
1704                 return -ENOMEM;
1705         }
1706
1707
1708
1709         /*
1710          * If Caching mode is set, then invalid translations are tagged
1711          * with domain-id 0, hence we need to pre-allocate it. We also
1712          * use domain-id 0 as a marker for non-allocated domain-id, so
1713          * make sure it is not used for a real domain.
1714          */
1715         set_bit(0, iommu->domain_ids);
1716
1717         return 0;
1718 }
1719
1720 static void disable_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722         struct device_domain_info *info, *tmp;
1723         unsigned long flags;
1724
1725         if (!iommu->domains || !iommu->domain_ids)
1726                 return;
1727
1728 again:
1729         spin_lock_irqsave(&device_domain_lock, flags);
1730         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1731                 struct dmar_domain *domain;
1732
1733                 if (info->iommu != iommu)
1734                         continue;
1735
1736                 if (!info->dev || !info->domain)
1737                         continue;
1738
1739                 domain = info->domain;
1740
1741                 __dmar_remove_one_dev_info(info);
1742
1743                 if (!domain_type_is_vm_or_si(domain)) {
1744                         /*
1745                          * The domain_exit() function  can't be called under
1746                          * device_domain_lock, as it takes this lock itself.
1747                          * So release the lock here and re-run the loop
1748                          * afterwards.
1749                          */
1750                         spin_unlock_irqrestore(&device_domain_lock, flags);
1751                         domain_exit(domain);
1752                         goto again;
1753                 }
1754         }
1755         spin_unlock_irqrestore(&device_domain_lock, flags);
1756
1757         if (iommu->gcmd & DMA_GCMD_TE)
1758                 iommu_disable_translation(iommu);
1759 }
1760
1761 static void free_dmar_iommu(struct intel_iommu *iommu)
1762 {
1763         if ((iommu->domains) && (iommu->domain_ids)) {
1764                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1765                 int i;
1766
1767                 for (i = 0; i < elems; i++)
1768                         kfree(iommu->domains[i]);
1769                 kfree(iommu->domains);
1770                 kfree(iommu->domain_ids);
1771                 iommu->domains = NULL;
1772                 iommu->domain_ids = NULL;
1773         }
1774
1775         g_iommus[iommu->seq_id] = NULL;
1776
1777         /* free context mapping */
1778         free_context_table(iommu);
1779
1780 #ifdef CONFIG_INTEL_IOMMU_SVM
1781         if (pasid_enabled(iommu)) {
1782                 if (ecap_prs(iommu->ecap))
1783                         intel_svm_finish_prq(iommu);
1784                 intel_svm_free_pasid_tables(iommu);
1785         }
1786 #endif
1787 }
1788
1789 static struct dmar_domain *alloc_domain(int flags)
1790 {
1791         struct dmar_domain *domain;
1792
1793         domain = alloc_domain_mem();
1794         if (!domain)
1795                 return NULL;
1796
1797         memset(domain, 0, sizeof(*domain));
1798         domain->nid = -1;
1799         domain->flags = flags;
1800         domain->has_iotlb_device = false;
1801         INIT_LIST_HEAD(&domain->devices);
1802
1803         return domain;
1804 }
1805
1806 /* Must be called with iommu->lock */
1807 static int domain_attach_iommu(struct dmar_domain *domain,
1808                                struct intel_iommu *iommu)
1809 {
1810         unsigned long ndomains;
1811         int num;
1812
1813         assert_spin_locked(&device_domain_lock);
1814         assert_spin_locked(&iommu->lock);
1815
1816         domain->iommu_refcnt[iommu->seq_id] += 1;
1817         domain->iommu_count += 1;
1818         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1819                 ndomains = cap_ndoms(iommu->cap);
1820                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1821
1822                 if (num >= ndomains) {
1823                         pr_err("%s: No free domain ids\n", iommu->name);
1824                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1825                         domain->iommu_count -= 1;
1826                         return -ENOSPC;
1827                 }
1828
1829                 set_bit(num, iommu->domain_ids);
1830                 set_iommu_domain(iommu, num, domain);
1831
1832                 domain->iommu_did[iommu->seq_id] = num;
1833                 domain->nid                      = iommu->node;
1834
1835                 domain_update_iommu_cap(domain);
1836         }
1837
1838         return 0;
1839 }
1840
1841 static int domain_detach_iommu(struct dmar_domain *domain,
1842                                struct intel_iommu *iommu)
1843 {
1844         int num, count = INT_MAX;
1845
1846         assert_spin_locked(&device_domain_lock);
1847         assert_spin_locked(&iommu->lock);
1848
1849         domain->iommu_refcnt[iommu->seq_id] -= 1;
1850         count = --domain->iommu_count;
1851         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1852                 num = domain->iommu_did[iommu->seq_id];
1853                 clear_bit(num, iommu->domain_ids);
1854                 set_iommu_domain(iommu, num, NULL);
1855
1856                 domain_update_iommu_cap(domain);
1857                 domain->iommu_did[iommu->seq_id] = 0;
1858         }
1859
1860         return count;
1861 }
1862
1863 static struct iova_domain reserved_iova_list;
1864 static struct lock_class_key reserved_rbtree_key;
1865
1866 static int dmar_init_reserved_ranges(void)
1867 {
1868         struct pci_dev *pdev = NULL;
1869         struct iova *iova;
1870         int i;
1871
1872         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1873                         DMA_32BIT_PFN);
1874
1875         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1876                 &reserved_rbtree_key);
1877
1878         /* IOAPIC ranges shouldn't be accessed by DMA */
1879         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1880                 IOVA_PFN(IOAPIC_RANGE_END));
1881         if (!iova) {
1882                 pr_err("Reserve IOAPIC range failed\n");
1883                 return -ENODEV;
1884         }
1885
1886         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1887         for_each_pci_dev(pdev) {
1888                 struct resource *r;
1889
1890                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1891                         r = &pdev->resource[i];
1892                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1893                                 continue;
1894                         iova = reserve_iova(&reserved_iova_list,
1895                                             IOVA_PFN(r->start),
1896                                             IOVA_PFN(r->end));
1897                         if (!iova) {
1898                                 pr_err("Reserve iova failed\n");
1899                                 return -ENODEV;
1900                         }
1901                 }
1902         }
1903         return 0;
1904 }
1905
1906 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1907 {
1908         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1909 }
1910
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9 * n, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1924
1925 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1926                        int guest_width)
1927 {
1928         int adjust_width, agaw;
1929         unsigned long sagaw;
1930
1931         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1932                         DMA_32BIT_PFN);
1933         domain_reserve_special_ranges(domain);
1934
1935         /* calculate AGAW */
1936         if (guest_width > cap_mgaw(iommu->cap))
1937                 guest_width = cap_mgaw(iommu->cap);
1938         domain->gaw = guest_width;
1939         adjust_width = guestwidth_to_adjustwidth(guest_width);
1940         agaw = width_to_agaw(adjust_width);
1941         sagaw = cap_sagaw(iommu->cap);
1942         if (!test_bit(agaw, &sagaw)) {
1943                 /* hardware doesn't support it, choose a bigger one */
1944                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1945                 agaw = find_next_bit(&sagaw, 5, agaw);
1946                 if (agaw >= 5)
1947                         return -ENODEV;
1948         }
1949         domain->agaw = agaw;
1950
1951         if (ecap_coherent(iommu->ecap))
1952                 domain->iommu_coherency = 1;
1953         else
1954                 domain->iommu_coherency = 0;
1955
1956         if (ecap_sc_support(iommu->ecap))
1957                 domain->iommu_snooping = 1;
1958         else
1959                 domain->iommu_snooping = 0;
1960
1961         if (intel_iommu_superpage)
1962                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1963         else
1964                 domain->iommu_superpage = 0;
1965
1966         domain->nid = iommu->node;
1967
1968         /* always allocate the top pgd */
1969         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1970         if (!domain->pgd)
1971                 return -ENOMEM;
1972         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1973         return 0;
1974 }
1975
1976 static void domain_exit(struct dmar_domain *domain)
1977 {
1978         struct page *freelist = NULL;
1979
1980         /* Domain 0 is reserved, so dont process it */
1981         if (!domain)
1982                 return;
1983
1984         /* Flush any lazy unmaps that may reference this domain */
1985         if (!intel_iommu_strict) {
1986                 int cpu;
1987
1988                 for_each_possible_cpu(cpu)
1989                         flush_unmaps_timeout(cpu);
1990         }
1991
1992         /* Remove associated devices and clear attached or cached domains */
1993         rcu_read_lock();
1994         domain_remove_dev_info(domain);
1995         rcu_read_unlock();
1996
1997         /* destroy iovas */
1998         put_iova_domain(&domain->iovad);
1999
2000         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2001
2002         dma_free_pagelist(freelist);
2003
2004         free_domain_mem(domain);
2005 }
2006
2007 static int domain_context_mapping_one(struct dmar_domain *domain,
2008                                       struct intel_iommu *iommu,
2009                                       u8 bus, u8 devfn)
2010 {
2011         u16 did = domain->iommu_did[iommu->seq_id];
2012         int translation = CONTEXT_TT_MULTI_LEVEL;
2013         struct device_domain_info *info = NULL;
2014         struct context_entry *context;
2015         unsigned long flags;
2016         struct dma_pte *pgd;
2017         int ret, agaw;
2018
2019         WARN_ON(did == 0);
2020
2021         if (hw_pass_through && domain_type_is_si(domain))
2022                 translation = CONTEXT_TT_PASS_THROUGH;
2023
2024         pr_debug("Set context mapping for %02x:%02x.%d\n",
2025                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2026
2027         BUG_ON(!domain->pgd);
2028
2029         spin_lock_irqsave(&device_domain_lock, flags);
2030         spin_lock(&iommu->lock);
2031
2032         ret = -ENOMEM;
2033         context = iommu_context_addr(iommu, bus, devfn, 1);
2034         if (!context)
2035                 goto out_unlock;
2036
2037         ret = 0;
2038         if (context_present(context))
2039                 goto out_unlock;
2040
2041         /*
2042          * For kdump cases, old valid entries may be cached due to the
2043          * in-flight DMA and copied pgtable, but there is no unmapping
2044          * behaviour for them, thus we need an explicit cache flush for
2045          * the newly-mapped device. For kdump, at this point, the device
2046          * is supposed to finish reset at its driver probe stage, so no
2047          * in-flight DMA will exist, and we don't need to worry anymore
2048          * hereafter.
2049          */
2050         if (context_copied(context)) {
2051                 u16 did_old = context_domain_id(context);
2052
2053                 if (did_old >= 0 && did_old < cap_ndoms(iommu->cap))
2054                         iommu->flush.flush_context(iommu, did_old,
2055                                                    (((u16)bus) << 8) | devfn,
2056                                                    DMA_CCMD_MASK_NOBIT,
2057                                                    DMA_CCMD_DEVICE_INVL);
2058         }
2059
2060         pgd = domain->pgd;
2061
2062         context_clear_entry(context);
2063         context_set_domain_id(context, did);
2064
2065         /*
2066          * Skip top levels of page tables for iommu which has less agaw
2067          * than default.  Unnecessary for PT mode.
2068          */
2069         if (translation != CONTEXT_TT_PASS_THROUGH) {
2070                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2071                         ret = -ENOMEM;
2072                         pgd = phys_to_virt(dma_pte_addr(pgd));
2073                         if (!dma_pte_present(pgd))
2074                                 goto out_unlock;
2075                 }
2076
2077                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2078                 if (info && info->ats_supported)
2079                         translation = CONTEXT_TT_DEV_IOTLB;
2080                 else
2081                         translation = CONTEXT_TT_MULTI_LEVEL;
2082
2083                 context_set_address_root(context, virt_to_phys(pgd));
2084                 context_set_address_width(context, iommu->agaw);
2085         } else {
2086                 /*
2087                  * In pass through mode, AW must be programmed to
2088                  * indicate the largest AGAW value supported by
2089                  * hardware. And ASR is ignored by hardware.
2090                  */
2091                 context_set_address_width(context, iommu->msagaw);
2092         }
2093
2094         context_set_translation_type(context, translation);
2095         context_set_fault_enable(context);
2096         context_set_present(context);
2097         domain_flush_cache(domain, context, sizeof(*context));
2098
2099         /*
2100          * It's a non-present to present mapping. If hardware doesn't cache
2101          * non-present entry we only need to flush the write-buffer. If the
2102          * _does_ cache non-present entries, then it does so in the special
2103          * domain #0, which we have to flush:
2104          */
2105         if (cap_caching_mode(iommu->cap)) {
2106                 iommu->flush.flush_context(iommu, 0,
2107                                            (((u16)bus) << 8) | devfn,
2108                                            DMA_CCMD_MASK_NOBIT,
2109                                            DMA_CCMD_DEVICE_INVL);
2110                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2111         } else {
2112                 iommu_flush_write_buffer(iommu);
2113         }
2114         iommu_enable_dev_iotlb(info);
2115
2116         ret = 0;
2117
2118 out_unlock:
2119         spin_unlock(&iommu->lock);
2120         spin_unlock_irqrestore(&device_domain_lock, flags);
2121
2122         return ret;
2123 }
2124
2125 struct domain_context_mapping_data {
2126         struct dmar_domain *domain;
2127         struct intel_iommu *iommu;
2128 };
2129
2130 static int domain_context_mapping_cb(struct pci_dev *pdev,
2131                                      u16 alias, void *opaque)
2132 {
2133         struct domain_context_mapping_data *data = opaque;
2134
2135         return domain_context_mapping_one(data->domain, data->iommu,
2136                                           PCI_BUS_NUM(alias), alias & 0xff);
2137 }
2138
2139 static int
2140 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2141 {
2142         struct intel_iommu *iommu;
2143         u8 bus, devfn;
2144         struct domain_context_mapping_data data;
2145
2146         iommu = device_to_iommu(dev, &bus, &devfn);
2147         if (!iommu)
2148                 return -ENODEV;
2149
2150         if (!dev_is_pci(dev))
2151                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2152
2153         data.domain = domain;
2154         data.iommu = iommu;
2155
2156         return pci_for_each_dma_alias(to_pci_dev(dev),
2157                                       &domain_context_mapping_cb, &data);
2158 }
2159
2160 static int domain_context_mapped_cb(struct pci_dev *pdev,
2161                                     u16 alias, void *opaque)
2162 {
2163         struct intel_iommu *iommu = opaque;
2164
2165         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2166 }
2167
2168 static int domain_context_mapped(struct device *dev)
2169 {
2170         struct intel_iommu *iommu;
2171         u8 bus, devfn;
2172
2173         iommu = device_to_iommu(dev, &bus, &devfn);
2174         if (!iommu)
2175                 return -ENODEV;
2176
2177         if (!dev_is_pci(dev))
2178                 return device_context_mapped(iommu, bus, devfn);
2179
2180         return !pci_for_each_dma_alias(to_pci_dev(dev),
2181                                        domain_context_mapped_cb, iommu);
2182 }
2183
2184 /* Returns a number of VTD pages, but aligned to MM page size */
2185 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2186                                             size_t size)
2187 {
2188         host_addr &= ~PAGE_MASK;
2189         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2190 }
2191
2192 /* Return largest possible superpage level for a given mapping */
2193 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2194                                           unsigned long iov_pfn,
2195                                           unsigned long phy_pfn,
2196                                           unsigned long pages)
2197 {
2198         int support, level = 1;
2199         unsigned long pfnmerge;
2200
2201         support = domain->iommu_superpage;
2202
2203         /* To use a large page, the virtual *and* physical addresses
2204            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2205            of them will mean we have to use smaller pages. So just
2206            merge them and check both at once. */
2207         pfnmerge = iov_pfn | phy_pfn;
2208
2209         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2210                 pages >>= VTD_STRIDE_SHIFT;
2211                 if (!pages)
2212                         break;
2213                 pfnmerge >>= VTD_STRIDE_SHIFT;
2214                 level++;
2215                 support--;
2216         }
2217         return level;
2218 }
2219
2220 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2221                             struct scatterlist *sg, unsigned long phys_pfn,
2222                             unsigned long nr_pages, int prot)
2223 {
2224         struct dma_pte *first_pte = NULL, *pte = NULL;
2225         phys_addr_t uninitialized_var(pteval);
2226         unsigned long sg_res = 0;
2227         unsigned int largepage_lvl = 0;
2228         unsigned long lvl_pages = 0;
2229
2230         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2231
2232         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2233                 return -EINVAL;
2234
2235         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2236
2237         if (!sg) {
2238                 sg_res = nr_pages;
2239                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2240         }
2241
2242         while (nr_pages > 0) {
2243                 uint64_t tmp;
2244
2245                 if (!sg_res) {
2246                         sg_res = aligned_nrpages(sg->offset, sg->length);
2247                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2248                         sg->dma_length = sg->length;
2249                         pteval = page_to_phys(sg_page(sg)) | prot;
2250                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2251                 }
2252
2253                 if (!pte) {
2254                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2255
2256                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2257                         if (!pte)
2258                                 return -ENOMEM;
2259                         /* It is large page*/
2260                         if (largepage_lvl > 1) {
2261                                 unsigned long nr_superpages, end_pfn;
2262
2263                                 pteval |= DMA_PTE_LARGE_PAGE;
2264                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2265
2266                                 nr_superpages = sg_res / lvl_pages;
2267                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2268
2269                                 /*
2270                                  * Ensure that old small page tables are
2271                                  * removed to make room for superpage(s).
2272                                  */
2273                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
2274                         } else {
2275                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2276                         }
2277
2278                 }
2279                 /* We don't need lock here, nobody else
2280                  * touches the iova range
2281                  */
2282                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2283                 if (tmp) {
2284                         static int dumps = 5;
2285                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2286                                 iov_pfn, tmp, (unsigned long long)pteval);
2287                         if (dumps) {
2288                                 dumps--;
2289                                 debug_dma_dump_mappings(NULL);
2290                         }
2291                         WARN_ON(1);
2292                 }
2293
2294                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2295
2296                 BUG_ON(nr_pages < lvl_pages);
2297                 BUG_ON(sg_res < lvl_pages);
2298
2299                 nr_pages -= lvl_pages;
2300                 iov_pfn += lvl_pages;
2301                 phys_pfn += lvl_pages;
2302                 pteval += lvl_pages * VTD_PAGE_SIZE;
2303                 sg_res -= lvl_pages;
2304
2305                 /* If the next PTE would be the first in a new page, then we
2306                    need to flush the cache on the entries we've just written.
2307                    And then we'll need to recalculate 'pte', so clear it and
2308                    let it get set again in the if (!pte) block above.
2309
2310                    If we're done (!nr_pages) we need to flush the cache too.
2311
2312                    Also if we've been setting superpages, we may need to
2313                    recalculate 'pte' and switch back to smaller pages for the
2314                    end of the mapping, if the trailing size is not enough to
2315                    use another superpage (i.e. sg_res < lvl_pages). */
2316                 pte++;
2317                 if (!nr_pages || first_pte_in_page(pte) ||
2318                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2319                         domain_flush_cache(domain, first_pte,
2320                                            (void *)pte - (void *)first_pte);
2321                         pte = NULL;
2322                 }
2323
2324                 if (!sg_res && nr_pages)
2325                         sg = sg_next(sg);
2326         }
2327         return 0;
2328 }
2329
/*
 * Map the scatterlist @sg into @domain at @iov_pfn.  The phys_pfn
 * argument of __domain_mapping() is passed as 0.
 */
static inline int domain_sg_mapping(struct dmar_domain *domain,
				    unsigned long iov_pfn,
				    struct scatterlist *sg,
				    unsigned long nr_pages, int prot)
{
	const unsigned long no_phys_pfn = 0;

	return __domain_mapping(domain, iov_pfn, sg, no_phys_pfn,
				nr_pages, prot);
}
2336
2337 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2338                                      unsigned long phys_pfn, unsigned long nr_pages,
2339                                      int prot)
2340 {
2341         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2342 }
2343
2344 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2345 {
2346         if (!iommu)
2347                 return;
2348
2349         clear_context_table(iommu, bus, devfn);
2350         iommu->flush.flush_context(iommu, 0, 0, 0,
2351                                            DMA_CCMD_GLOBAL_INVL);
2352         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2353 }
2354
2355 static inline void unlink_domain_info(struct device_domain_info *info)
2356 {
2357         assert_spin_locked(&device_domain_lock);
2358         list_del(&info->link);
2359         list_del(&info->global);
2360         if (info->dev)
2361                 info->dev->archdata.iommu = NULL;
2362 }
2363
2364 static void domain_remove_dev_info(struct dmar_domain *domain)
2365 {
2366         struct device_domain_info *info, *tmp;
2367         unsigned long flags;
2368
2369         spin_lock_irqsave(&device_domain_lock, flags);
2370         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2371                 __dmar_remove_one_dev_info(info);
2372         spin_unlock_irqrestore(&device_domain_lock, flags);
2373 }
2374
2375 /*
2376  * find_domain
2377  * Note: we use struct device->archdata.iommu stores the info
2378  */
2379 static struct dmar_domain *find_domain(struct device *dev)
2380 {
2381         struct device_domain_info *info;
2382
2383         /* No lock here, assumes no domain exit in normal case */
2384         info = dev->archdata.iommu;
2385         if (info)
2386                 return info->domain;
2387         return NULL;
2388 }
2389
2390 static inline struct device_domain_info *
2391 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2392 {
2393         struct device_domain_info *info;
2394
2395         list_for_each_entry(info, &device_domain_list, global)
2396                 if (info->iommu->segment == segment && info->bus == bus &&
2397                     info->devfn == devfn)
2398                         return info;
2399
2400         return NULL;
2401 }
2402
2403 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2404                                                     int bus, int devfn,
2405                                                     struct device *dev,
2406                                                     struct dmar_domain *domain)
2407 {
2408         struct dmar_domain *found = NULL;
2409         struct device_domain_info *info;
2410         unsigned long flags;
2411         int ret;
2412
2413         info = alloc_devinfo_mem();
2414         if (!info)
2415                 return NULL;
2416
2417         info->bus = bus;
2418         info->devfn = devfn;
2419         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2420         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2421         info->ats_qdep = 0;
2422         info->dev = dev;
2423         info->domain = domain;
2424         info->iommu = iommu;
2425
2426         if (dev && dev_is_pci(dev)) {
2427                 struct pci_dev *pdev = to_pci_dev(info->dev);
2428
2429                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2430                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2431                     dmar_find_matched_atsr_unit(pdev))
2432                         info->ats_supported = 1;
2433
2434                 if (ecs_enabled(iommu)) {
2435                         if (pasid_enabled(iommu)) {
2436                                 int features = pci_pasid_features(pdev);
2437                                 if (features >= 0)
2438                                         info->pasid_supported = features | 1;
2439                         }
2440
2441                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2442                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2443                                 info->pri_supported = 1;
2444                 }
2445         }
2446
2447         spin_lock_irqsave(&device_domain_lock, flags);
2448         if (dev)
2449                 found = find_domain(dev);
2450
2451         if (!found) {
2452                 struct device_domain_info *info2;
2453                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2454                 if (info2) {
2455                         found      = info2->domain;
2456                         info2->dev = dev;
2457                 }
2458         }
2459
2460         if (found) {
2461                 spin_unlock_irqrestore(&device_domain_lock, flags);
2462                 free_devinfo_mem(info);
2463                 /* Caller must free the original domain */
2464                 return found;
2465         }
2466
2467         spin_lock(&iommu->lock);
2468         ret = domain_attach_iommu(domain, iommu);
2469         spin_unlock(&iommu->lock);
2470
2471         if (ret) {
2472                 spin_unlock_irqrestore(&device_domain_lock, flags);
2473                 free_devinfo_mem(info);
2474                 return NULL;
2475         }
2476
2477         list_add(&info->link, &domain->devices);
2478         list_add(&info->global, &device_domain_list);
2479         if (dev)
2480                 dev->archdata.iommu = info;
2481         spin_unlock_irqrestore(&device_domain_lock, flags);
2482
2483         if (dev && domain_context_mapping(domain, dev)) {
2484                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2485                 dmar_remove_one_dev_info(domain, dev);
2486                 return NULL;
2487         }
2488
2489         return domain;
2490 }
2491
2492 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2493 {
2494         *(u16 *)opaque = alias;
2495         return 0;
2496 }
2497
2498 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2499 {
2500         struct device_domain_info *info = NULL;
2501         struct dmar_domain *domain = NULL;
2502         struct intel_iommu *iommu;
2503         u16 req_id, dma_alias;
2504         unsigned long flags;
2505         u8 bus, devfn;
2506
2507         iommu = device_to_iommu(dev, &bus, &devfn);
2508         if (!iommu)
2509                 return NULL;
2510
2511         req_id = ((u16)bus << 8) | devfn;
2512
2513         if (dev_is_pci(dev)) {
2514                 struct pci_dev *pdev = to_pci_dev(dev);
2515
2516                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2517
2518                 spin_lock_irqsave(&device_domain_lock, flags);
2519                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2520                                                       PCI_BUS_NUM(dma_alias),
2521                                                       dma_alias & 0xff);
2522                 if (info) {
2523                         iommu = info->iommu;
2524                         domain = info->domain;
2525                 }
2526                 spin_unlock_irqrestore(&device_domain_lock, flags);
2527
2528                 /* DMA alias already has a domain, use it */
2529                 if (info)
2530                         goto out;
2531         }
2532
2533         /* Allocate and initialize new domain for the device */
2534         domain = alloc_domain(0);
2535         if (!domain)
2536                 return NULL;
2537         if (domain_init(domain, iommu, gaw)) {
2538                 domain_exit(domain);
2539                 return NULL;
2540         }
2541
2542 out:
2543
2544         return domain;
2545 }
2546
2547 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2548                                               struct dmar_domain *domain)
2549 {
2550         struct intel_iommu *iommu;
2551         struct dmar_domain *tmp;
2552         u16 req_id, dma_alias;
2553         u8 bus, devfn;
2554
2555         iommu = device_to_iommu(dev, &bus, &devfn);
2556         if (!iommu)
2557                 return NULL;
2558
2559         req_id = ((u16)bus << 8) | devfn;
2560
2561         if (dev_is_pci(dev)) {
2562                 struct pci_dev *pdev = to_pci_dev(dev);
2563
2564                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2565
2566                 /* register PCI DMA alias device */
2567                 if (req_id != dma_alias) {
2568                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2569                                         dma_alias & 0xff, NULL, domain);
2570
2571                         if (!tmp || tmp != domain)
2572                                 return tmp;
2573                 }
2574         }
2575
2576         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2577         if (!tmp || tmp != domain)
2578                 return tmp;
2579
2580         return domain;
2581 }
2582
2583 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2584 {
2585         struct dmar_domain *domain, *tmp;
2586
2587         domain = find_domain(dev);
2588         if (domain)
2589                 goto out;
2590
2591         domain = find_or_alloc_domain(dev, gaw);
2592         if (!domain)
2593                 goto out;
2594
2595         tmp = set_domain_for_dev(dev, domain);
2596         if (!tmp || domain != tmp) {
2597                 domain_exit(domain);
2598                 domain = tmp;
2599         }
2600
2601 out:
2602
2603         return domain;
2604 }
2605
2606 static int iommu_domain_identity_map(struct dmar_domain *domain,
2607                                      unsigned long long start,
2608                                      unsigned long long end)
2609 {
2610         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2611         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2612
2613         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2614                           dma_to_mm_pfn(last_vpfn))) {
2615                 pr_err("Reserving iova failed\n");
2616                 return -ENOMEM;
2617         }
2618
2619         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2620         /*
2621          * RMRR range might have overlap with physical memory range,
2622          * clear it first
2623          */
2624         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2625
2626         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2627                                   last_vpfn - first_vpfn + 1,
2628                                   DMA_PTE_READ|DMA_PTE_WRITE);
2629 }
2630
2631 static int domain_prepare_identity_map(struct device *dev,
2632                                        struct dmar_domain *domain,
2633                                        unsigned long long start,
2634                                        unsigned long long end)
2635 {
2636         /* For _hardware_ passthrough, don't bother. But for software
2637            passthrough, we do it anyway -- it may indicate a memory
2638            range which is reserved in E820, so which didn't get set
2639            up to start with in si_domain */
2640         if (domain == si_domain && hw_pass_through) {
2641                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2642                         dev_name(dev), start, end);
2643                 return 0;
2644         }
2645
2646         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2647                 dev_name(dev), start, end);
2648
2649         if (end < start) {
2650                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2651                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2652                         dmi_get_system_info(DMI_BIOS_VENDOR),
2653                         dmi_get_system_info(DMI_BIOS_VERSION),
2654                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2655                 return -EIO;
2656         }
2657
2658         if (end >> agaw_to_width(domain->agaw)) {
2659                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2660                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2661                      agaw_to_width(domain->agaw),
2662                      dmi_get_system_info(DMI_BIOS_VENDOR),
2663                      dmi_get_system_info(DMI_BIOS_VERSION),
2664                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2665                 return -EIO;
2666         }
2667
2668         return iommu_domain_identity_map(domain, start, end);
2669 }
2670
2671 static int iommu_prepare_identity_map(struct device *dev,
2672                                       unsigned long long start,
2673                                       unsigned long long end)
2674 {
2675         struct dmar_domain *domain;
2676         int ret;
2677
2678         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2679         if (!domain)
2680                 return -ENOMEM;
2681
2682         ret = domain_prepare_identity_map(dev, domain, start, end);
2683         if (ret)
2684                 domain_exit(domain);
2685
2686         return ret;
2687 }
2688
2689 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2690                                          struct device *dev)
2691 {
2692         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2693                 return 0;
2694         return iommu_prepare_identity_map(dev, rmrr->base_address,
2695                                           rmrr->end_address);
2696 }
2697
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Floppy workaround: identity-map the first 16MiB for the first PCI device of
 * class ISA bridge, if there is one.  A failure is only logged.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *isa_bridge;

	isa_bridge = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!isa_bridge)
		return;

	pr_info("Prepare 0-16MiB unity mapping for LPC\n");

	if (iommu_prepare_identity_map(&isa_bridge->dev, 0, 16*1024*1024 - 1))
		pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");

	/* Release the reference obtained from pci_get_class() */
	pci_dev_put(isa_bridge);
}
#else
/* Workaround not configured: nothing to prepare. */
static inline void iommu_prepare_isa(void)
{
}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2722
2723 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2724
2725 static int __init si_domain_init(int hw)
2726 {
2727         int nid, ret = 0;
2728
2729         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2730         if (!si_domain)
2731                 return -EFAULT;
2732
2733         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2734                 domain_exit(si_domain);
2735                 return -EFAULT;
2736         }
2737
2738         pr_debug("Identity mapping domain allocated\n");
2739
2740         if (hw)
2741                 return 0;
2742
2743         for_each_online_node(nid) {
2744                 unsigned long start_pfn, end_pfn;
2745                 int i;
2746
2747                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2748                         ret = iommu_domain_identity_map(si_domain,
2749                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2750                         if (ret)
2751                                 return ret;
2752                 }
2753         }
2754
2755         return 0;
2756 }
2757
2758 static int identity_mapping(struct device *dev)
2759 {
2760         struct device_domain_info *info;
2761
2762         if (likely(!iommu_identity_mapping))
2763                 return 0;
2764
2765         info = dev->archdata.iommu;
2766         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2767                 return (info->domain == si_domain);
2768
2769         return 0;
2770 }
2771
2772 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2773 {
2774         struct dmar_domain *ndomain;
2775         struct intel_iommu *iommu;
2776         u8 bus, devfn;
2777
2778         iommu = device_to_iommu(dev, &bus, &devfn);
2779         if (!iommu)
2780                 return -ENODEV;
2781
2782         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2783         if (ndomain != domain)
2784                 return -EBUSY;
2785
2786         return 0;
2787 }
2788
2789 static bool device_has_rmrr(struct device *dev)
2790 {
2791         struct dmar_rmrr_unit *rmrr;
2792         struct device *tmp;
2793         int i;
2794
2795         rcu_read_lock();
2796         for_each_rmrr_units(rmrr) {
2797                 /*
2798                  * Return TRUE if this RMRR contains the device that
2799                  * is passed in.
2800                  */
2801                 for_each_active_dev_scope(rmrr->devices,
2802                                           rmrr->devices_cnt, i, tmp)
2803                         if (tmp == dev) {
2804                                 rcu_read_unlock();
2805                                 return true;
2806                         }
2807         }
2808         rcu_read_unlock();
2809         return false;
2810 }
2811
2812 /*
2813  * There are a couple cases where we need to restrict the functionality of
2814  * devices associated with RMRRs.  The first is when evaluating a device for
2815  * identity mapping because problems exist when devices are moved in and out
2816  * of domains and their respective RMRR information is lost.  This means that
2817  * a device with associated RMRRs will never be in a "passthrough" domain.
2818  * The second is use of the device through the IOMMU API.  This interface
2819  * expects to have full control of the IOVA space for the device.  We cannot
2820  * satisfy both the requirement that RMRR access is maintained and have an
2821  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2822  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2823  * We therefore prevent devices associated with an RMRR from participating in
2824  * the IOMMU API, which eliminates them from device assignment.
2825  *
2826  * In both cases we assume that PCI USB devices with RMRRs have them largely
2827  * for historical reasons and that the RMRR space is not actively used post
2828  * boot.  This exclusion may change if vendors begin to abuse it.
2829  *
2830  * The same exception is made for graphics devices, with the requirement that
2831  * any use of the RMRR regions will be torn down before assigning the device
2832  * to a guest.
2833  */
2834 static bool device_is_rmrr_locked(struct device *dev)
2835 {
2836         if (!device_has_rmrr(dev))
2837                 return false;
2838
2839         if (dev_is_pci(dev)) {
2840                 struct pci_dev *pdev = to_pci_dev(dev);
2841
2842                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2843                         return false;
2844         }
2845
2846         return true;
2847 }
2848
2849 static int iommu_should_identity_map(struct device *dev, int startup)
2850 {
2851
2852         if (dev_is_pci(dev)) {
2853                 struct pci_dev *pdev = to_pci_dev(dev);
2854
2855                 if (device_is_rmrr_locked(dev))
2856                         return 0;
2857
2858                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2859                         return 1;
2860
2861                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2862                         return 1;
2863
2864                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2865                         return 0;
2866
2867                 /*
2868                  * We want to start off with all devices in the 1:1 domain, and
2869                  * take them out later if we find they can't access all of memory.
2870                  *
2871                  * However, we can't do this for PCI devices behind bridges,
2872                  * because all PCI devices behind the same bridge will end up
2873                  * with the same source-id on their transactions.
2874                  *
2875                  * Practically speaking, we can't change things around for these
2876                  * devices at run-time, because we can't be sure there'll be no
2877                  * DMA transactions in flight for any of their siblings.
2878                  *
2879                  * So PCI devices (unless they're on the root bus) as well as
2880                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2881                  * the 1:1 domain, just in _case_ one of their siblings turns out
2882                  * not to be able to map all of memory.
2883                  */
2884                 if (!pci_is_pcie(pdev)) {
2885                         if (!pci_is_root_bus(pdev->bus))
2886                                 return 0;
2887                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2888                                 return 0;
2889                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2890                         return 0;
2891         } else {
2892                 if (device_has_rmrr(dev))
2893                         return 0;
2894         }
2895
2896         /*
2897          * At boot time, we don't yet know if devices will be 64-bit capable.
2898          * Assume that they will — if they turn out not to be, then we can
2899          * take them out of the 1:1 domain later.
2900          */
2901         if (!startup) {
2902                 /*
2903                  * If the device's dma_mask is less than the system's memory
2904                  * size then this is not a candidate for identity mapping.
2905                  */
2906                 u64 dma_mask = *dev->dma_mask;
2907
2908                 if (dev->coherent_dma_mask &&
2909                     dev->coherent_dma_mask < dma_mask)
2910                         dma_mask = dev->coherent_dma_mask;
2911
2912                 return dma_mask >= dma_get_required_mask(dev);
2913         }
2914
2915         return 1;
2916 }
2917
2918 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2919 {
2920         int ret;
2921
2922         if (!iommu_should_identity_map(dev, 1))
2923                 return 0;
2924
2925         ret = domain_add_dev_info(si_domain, dev);
2926         if (!ret)
2927                 pr_info("%s identity mapping for device %s\n",
2928                         hw ? "Hardware" : "Software", dev_name(dev));
2929         else if (ret == -ENODEV)
2930                 /* device not associated with an iommu */
2931                 ret = 0;
2932
2933         return ret;
2934 }
2935
2936
2937 static int __init iommu_prepare_static_identity_mapping(int hw)
2938 {
2939         struct pci_dev *pdev = NULL;
2940         struct dmar_drhd_unit *drhd;
2941         struct intel_iommu *iommu;
2942         struct device *dev;
2943         int i;
2944         int ret = 0;
2945
2946         for_each_pci_dev(pdev) {
2947                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2948                 if (ret)
2949                         return ret;
2950         }
2951
2952         for_each_active_iommu(iommu, drhd)
2953                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2954                         struct acpi_device_physical_node *pn;
2955                         struct acpi_device *adev;
2956
2957                         if (dev->bus != &acpi_bus_type)
2958                                 continue;
2959
2960                         adev= to_acpi_device(dev);
2961                         mutex_lock(&adev->physical_node_lock);
2962                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2963                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2964                                 if (ret)
2965                                         break;
2966                         }
2967                         mutex_unlock(&adev->physical_node_lock);
2968                         if (ret)
2969                                 return ret;
2970                 }
2971
2972         return 0;
2973 }
2974
2975 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2976 {
2977         /*
2978          * Start from the sane iommu hardware state.
2979          * If the queued invalidation is already initialized by us
2980          * (for example, while enabling interrupt-remapping) then
2981          * we got the things already rolling from a sane state.
2982          */
2983         if (!iommu->qi) {
2984                 /*
2985                  * Clear any previous faults.
2986                  */
2987                 dmar_fault(-1, iommu);
2988                 /*
2989                  * Disable queued invalidation if supported and already enabled
2990                  * before OS handover.
2991                  */
2992                 dmar_disable_qi(iommu);
2993         }
2994
2995         if (dmar_enable_qi(iommu)) {
2996                 /*
2997                  * Queued Invalidate not enabled, use Register Based Invalidate
2998                  */
2999                 iommu->flush.flush_context = __iommu_flush_context;
3000                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3001                 pr_info("%s: Using Register based invalidation\n",
3002                         iommu->name);
3003         } else {
3004                 iommu->flush.flush_context = qi_flush_context;
3005                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3006                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3007         }
3008 }
3009
3010 static int copy_context_table(struct intel_iommu *iommu,
3011                               struct root_entry *old_re,
3012                               struct context_entry **tbl,
3013                               int bus, bool ext)
3014 {
3015         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3016         struct context_entry *new_ce = NULL, ce;
3017         struct context_entry *old_ce = NULL;
3018         struct root_entry re;
3019         phys_addr_t old_ce_phys;
3020
3021         tbl_idx = ext ? bus * 2 : bus;
3022         memcpy(&re, old_re, sizeof(re));
3023
3024         for (devfn = 0; devfn < 256; devfn++) {
3025                 /* First calculate the correct index */
3026                 idx = (ext ? devfn * 2 : devfn) % 256;
3027
3028                 if (idx == 0) {
3029                         /* First save what we may have and clean up */
3030                         if (new_ce) {
3031                                 tbl[tbl_idx] = new_ce;
3032                                 __iommu_flush_cache(iommu, new_ce,
3033                                                     VTD_PAGE_SIZE);
3034                                 pos = 1;
3035                         }
3036
3037                         if (old_ce)
3038                                 iounmap(old_ce);
3039
3040                         ret = 0;
3041                         if (devfn < 0x80)
3042                                 old_ce_phys = root_entry_lctp(&re);
3043                         else
3044                                 old_ce_phys = root_entry_uctp(&re);
3045
3046                         if (!old_ce_phys) {
3047                                 if (ext && devfn == 0) {
3048                                         /* No LCTP, try UCTP */
3049                                         devfn = 0x7f;
3050                                         continue;
3051                                 } else {
3052                                         goto out;
3053                                 }
3054                         }
3055
3056                         ret = -ENOMEM;
3057                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3058                                         MEMREMAP_WB);
3059                         if (!old_ce)
3060                                 goto out;
3061
3062                         new_ce = alloc_pgtable_page(iommu->node);
3063                         if (!new_ce)
3064                                 goto out_unmap;
3065
3066                         ret = 0;
3067                 }
3068
3069                 /* Now copy the context entry */
3070                 memcpy(&ce, old_ce + idx, sizeof(ce));
3071
3072                 if (!__context_present(&ce))
3073                         continue;
3074
3075                 did = context_domain_id(&ce);
3076                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3077                         set_bit(did, iommu->domain_ids);
3078
3079                 /*
3080                  * We need a marker for copied context entries. This
3081                  * marker needs to work for the old format as well as
3082                  * for extended context entries.
3083                  *
3084                  * Bit 67 of the context entry is used. In the old
3085                  * format this bit is available to software, in the
3086                  * extended format it is the PGE bit, but PGE is ignored
3087                  * by HW if PASIDs are disabled (and thus still
3088                  * available).
3089                  *
3090                  * So disable PASIDs first and then mark the entry
3091                  * copied. This means that we don't copy PASID
3092                  * translations from the old kernel, but this is fine as
3093                  * faults there are not fatal.
3094                  */
3095                 context_clear_pasid_enable(&ce);
3096                 context_set_copied(&ce);
3097
3098                 new_ce[idx] = ce;
3099         }
3100
3101         tbl[tbl_idx + pos] = new_ce;
3102
3103         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3104
3105 out_unmap:
3106         memunmap(old_ce);
3107
3108 out:
3109         return ret;
3110 }
3111
3112 static int copy_translation_tables(struct intel_iommu *iommu)
3113 {
3114         struct context_entry **ctxt_tbls;
3115         struct root_entry *old_rt;
3116         phys_addr_t old_rt_phys;
3117         int ctxt_table_entries;
3118         unsigned long flags;
3119         u64 rtaddr_reg;
3120         int bus, ret;
3121         bool new_ext, ext;
3122
3123         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3124         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3125         new_ext    = !!ecap_ecs(iommu->ecap);
3126
3127         /*
3128          * The RTT bit can only be changed when translation is disabled,
3129          * but disabling translation means to open a window for data
3130          * corruption. So bail out and don't copy anything if we would
3131          * have to change the bit.
3132          */
3133         if (new_ext != ext)
3134                 return -EINVAL;
3135
3136         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3137         if (!old_rt_phys)
3138                 return -EINVAL;
3139
3140         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3141         if (!old_rt)
3142                 return -ENOMEM;
3143
3144         /* This is too big for the stack - allocate it from slab */
3145         ctxt_table_entries = ext ? 512 : 256;
3146         ret = -ENOMEM;
3147         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3148         if (!ctxt_tbls)
3149                 goto out_unmap;
3150
3151         for (bus = 0; bus < 256; bus++) {
3152                 ret = copy_context_table(iommu, &old_rt[bus],
3153                                          ctxt_tbls, bus, ext);
3154                 if (ret) {
3155                         pr_err("%s: Failed to copy context table for bus %d\n",
3156                                 iommu->name, bus);
3157                         continue;
3158                 }
3159         }
3160
3161         spin_lock_irqsave(&iommu->lock, flags);
3162
3163         /* Context tables are copied, now write them to the root_entry table */
3164         for (bus = 0; bus < 256; bus++) {
3165                 int idx = ext ? bus * 2 : bus;
3166                 u64 val;
3167
3168                 if (ctxt_tbls[idx]) {
3169                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3170                         iommu->root_entry[bus].lo = val;
3171                 }
3172
3173                 if (!ext || !ctxt_tbls[idx + 1])
3174                         continue;
3175
3176                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3177                 iommu->root_entry[bus].hi = val;
3178         }
3179
3180         spin_unlock_irqrestore(&iommu->lock, flags);
3181
3182         kfree(ctxt_tbls);
3183
3184         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3185
3186         ret = 0;
3187
3188 out_unmap:
3189         memunmap(old_rt);
3190
3191         return ret;
3192 }
3193
3194 static int __init init_dmars(void)
3195 {
3196         struct dmar_drhd_unit *drhd;
3197         struct dmar_rmrr_unit *rmrr;
3198         bool copied_tables = false;
3199         struct device *dev;
3200         struct intel_iommu *iommu;
3201         int i, ret, cpu;
3202
3203         /*
3204          * for each drhd
3205          *    allocate root
3206          *    initialize and program root entry to not present
3207          * endfor
3208          */
3209         for_each_drhd_unit(drhd) {
3210                 /*
3211                  * lock not needed as this is only incremented in the single
3212                  * threaded kernel __init code path all other access are read
3213                  * only
3214                  */
3215                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3216                         g_num_of_iommus++;
3217                         continue;
3218                 }
3219                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3220         }
3221
3222         /* Preallocate enough resources for IOMMU hot-addition */
3223         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3224                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3225
3226         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3227                         GFP_KERNEL);
3228         if (!g_iommus) {
3229                 pr_err("Allocating global iommu array failed\n");
3230                 ret = -ENOMEM;
3231                 goto error;
3232         }
3233
3234         for_each_possible_cpu(cpu) {
3235                 struct deferred_flush_data *dfd = per_cpu_ptr(&deferred_flush,
3236                                                               cpu);
3237
3238                 dfd->tables = kzalloc(g_num_of_iommus *
3239                                       sizeof(struct deferred_flush_table),
3240                                       GFP_KERNEL);
3241                 if (!dfd->tables) {
3242                         ret = -ENOMEM;
3243                         goto free_g_iommus;
3244                 }
3245
3246                 spin_lock_init(&dfd->lock);
3247                 setup_timer(&dfd->timer, flush_unmaps_timeout, cpu);
3248         }
3249
3250         for_each_active_iommu(iommu, drhd) {
3251                 g_iommus[iommu->seq_id] = iommu;
3252
3253                 intel_iommu_init_qi(iommu);
3254
3255                 ret = iommu_init_domains(iommu);
3256                 if (ret)
3257                         goto free_iommu;
3258
3259                 init_translation_status(iommu);
3260
3261                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3262                         iommu_disable_translation(iommu);
3263                         clear_translation_pre_enabled(iommu);
3264                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3265                                 iommu->name);
3266                 }
3267
3268                 /*
3269                  * TBD:
3270                  * we could share the same root & context tables
3271                  * among all IOMMU's. Need to Split it later.
3272                  */
3273                 ret = iommu_alloc_root_entry(iommu);
3274                 if (ret)
3275                         goto free_iommu;
3276
3277                 if (translation_pre_enabled(iommu)) {
3278                         pr_info("Translation already enabled - trying to copy translation structures\n");
3279
3280                         ret = copy_translation_tables(iommu);
3281                         if (ret) {
3282                                 /*
3283                                  * We found the IOMMU with translation
3284                                  * enabled - but failed to copy over the
3285                                  * old root-entry table. Try to proceed
3286                                  * by disabling translation now and
3287                                  * allocating a clean root-entry table.
3288                                  * This might cause DMAR faults, but
3289                                  * probably the dump will still succeed.
3290                                  */
3291                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3292                                        iommu->name);
3293                                 iommu_disable_translation(iommu);
3294                                 clear_translation_pre_enabled(iommu);
3295                         } else {
3296                                 pr_info("Copied translation tables from previous kernel for %s\n",
3297                                         iommu->name);
3298                                 copied_tables = true;
3299                         }
3300                 }
3301
3302                 if (!ecap_pass_through(iommu->ecap))
3303                         hw_pass_through = 0;
3304 #ifdef CONFIG_INTEL_IOMMU_SVM
3305                 if (pasid_enabled(iommu))
3306                         intel_svm_alloc_pasid_tables(iommu);
3307 #endif
3308         }
3309
3310         /*
3311          * Now that qi is enabled on all iommus, set the root entry and flush
3312          * caches. This is required on some Intel X58 chipsets, otherwise the
3313          * flush_context function will loop forever and the boot hangs.
3314          */
3315         for_each_active_iommu(iommu, drhd) {
3316                 iommu_flush_write_buffer(iommu);
3317                 iommu_set_root_entry(iommu);
3318                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3319                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3320         }
3321
3322         if (iommu_pass_through)
3323                 iommu_identity_mapping |= IDENTMAP_ALL;
3324
3325 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3326         iommu_identity_mapping |= IDENTMAP_GFX;
3327 #endif
3328
3329         check_tylersburg_isoch();
3330
3331         if (iommu_identity_mapping) {
3332                 ret = si_domain_init(hw_pass_through);
3333                 if (ret)
3334                         goto free_iommu;
3335         }
3336
3337
3338         /*
3339          * If we copied translations from a previous kernel in the kdump
3340          * case, we can not assign the devices to domains now, as that
3341          * would eliminate the old mappings. So skip this part and defer
3342          * the assignment to device driver initialization time.
3343          */
3344         if (copied_tables)
3345                 goto domains_done;
3346
3347         /*
3348          * If pass through is not set or not enabled, setup context entries for
3349          * identity mappings for rmrr, gfx, and isa and may fall back to static
3350          * identity mapping if iommu_identity_mapping is set.
3351          */
3352         if (iommu_identity_mapping) {
3353                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3354                 if (ret) {
3355                         pr_crit("Failed to setup IOMMU pass-through\n");
3356                         goto free_iommu;
3357                 }
3358         }
3359         /*
3360          * For each rmrr
3361          *   for each dev attached to rmrr
3362          *   do
3363          *     locate drhd for dev, alloc domain for dev
3364          *     allocate free domain
3365          *     allocate page table entries for rmrr
3366          *     if context not allocated for bus
3367          *           allocate and init context
3368          *           set present in root table for this bus
3369          *     init context with domain, translation etc
3370          *    endfor
3371          * endfor
3372          */
3373         pr_info("Setting RMRR:\n");
3374         for_each_rmrr_units(rmrr) {
3375                 /* some BIOS lists non-exist devices in DMAR table. */
3376                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3377                                           i, dev) {
3378                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3379                         if (ret)
3380                                 pr_err("Mapping reserved region failed\n");
3381                 }
3382         }
3383
3384         iommu_prepare_isa();
3385
3386 domains_done:
3387
3388         /*
3389          * for each drhd
3390          *   enable fault log
3391          *   global invalidate context cache
3392          *   global invalidate iotlb
3393          *   enable translation
3394          */
3395         for_each_iommu(iommu, drhd) {
3396                 if (drhd->ignored) {
3397                         /*
3398                          * we always have to disable PMRs or DMA may fail on
3399                          * this device
3400                          */
3401                         if (force_on)
3402                                 iommu_disable_protect_mem_regions(iommu);
3403                         continue;
3404                 }
3405
3406                 iommu_flush_write_buffer(iommu);
3407
3408 #ifdef CONFIG_INTEL_IOMMU_SVM
3409                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3410                         ret = intel_svm_enable_prq(iommu);
3411                         if (ret)
3412                                 goto free_iommu;
3413                 }
3414 #endif
3415                 ret = dmar_set_interrupt(iommu);
3416                 if (ret)
3417                         goto free_iommu;
3418
3419                 if (!translation_pre_enabled(iommu))
3420                         iommu_enable_translation(iommu);
3421
3422                 iommu_disable_protect_mem_regions(iommu);
3423         }
3424
3425         return 0;
3426
3427 free_iommu:
3428         for_each_active_iommu(iommu, drhd) {
3429                 disable_dmar_iommu(iommu);
3430                 free_dmar_iommu(iommu);
3431         }
3432 free_g_iommus:
3433         for_each_possible_cpu(cpu)
3434                 kfree(per_cpu_ptr(&deferred_flush, cpu)->tables);
3435         kfree(g_iommus);
3436 error:
3437         return ret;
3438 }
3439
3440 /* This takes a number of _MM_ pages, not VTD pages */
3441 static unsigned long intel_alloc_iova(struct device *dev,
3442                                      struct dmar_domain *domain,
3443                                      unsigned long nrpages, uint64_t dma_mask)
3444 {
3445         unsigned long iova_pfn = 0;
3446
3447         /* Restrict dma_mask to the width that the iommu can handle */
3448         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3449         /* Ensure we reserve the whole size-aligned region */
3450         nrpages = __roundup_pow_of_two(nrpages);
3451
3452         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3453                 /*
3454                  * First try to allocate an io virtual address in
3455                  * DMA_BIT_MASK(32) and if that fails then try allocating
3456                  * from higher range
3457                  */
3458                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3459                                            IOVA_PFN(DMA_BIT_MASK(32)));
3460                 if (iova_pfn)
3461                         return iova_pfn;
3462         }
3463         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask));
3464         if (unlikely(!iova_pfn)) {
3465                 pr_err("Allocating %ld-page iova for %s failed",
3466                        nrpages, dev_name(dev));
3467                 return 0;
3468         }
3469
3470         return iova_pfn;
3471 }
3472
3473 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3474 {
3475         struct dmar_domain *domain, *tmp;
3476         struct dmar_rmrr_unit *rmrr;
3477         struct device *i_dev;
3478         int i, ret;
3479
3480         domain = find_domain(dev);
3481         if (domain)
3482                 goto out;
3483
3484         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3485         if (!domain)
3486                 goto out;
3487
3488         /* We have a new domain - setup possible RMRRs for the device */
3489         rcu_read_lock();
3490         for_each_rmrr_units(rmrr) {
3491                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3492                                           i, i_dev) {
3493                         if (i_dev != dev)
3494                                 continue;
3495
3496                         ret = domain_prepare_identity_map(dev, domain,
3497                                                           rmrr->base_address,
3498                                                           rmrr->end_address);
3499                         if (ret)
3500                                 dev_err(dev, "Mapping reserved region failed\n");
3501                 }
3502         }
3503         rcu_read_unlock();
3504
3505         tmp = set_domain_for_dev(dev, domain);
3506         if (!tmp || domain != tmp) {
3507                 domain_exit(domain);
3508                 domain = tmp;
3509         }
3510
3511 out:
3512
3513         if (!domain)
3514                 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3515
3516
3517         return domain;
3518 }
3519
3520 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3521 {
3522         struct device_domain_info *info;
3523
3524         /* No lock here, assumes no domain exit in normal case */
3525         info = dev->archdata.iommu;
3526         if (likely(info))
3527                 return info->domain;
3528
3529         return __get_valid_domain_for_dev(dev);
3530 }
3531
3532 /* Check if the dev needs to go through non-identity map and unmap process.*/
3533 static int iommu_no_mapping(struct device *dev)
3534 {
3535         int found;
3536
3537         if (iommu_dummy(dev))
3538                 return 1;
3539
3540         if (!iommu_identity_mapping)
3541                 return 0;
3542
3543         found = identity_mapping(dev);
3544         if (found) {
3545                 if (iommu_should_identity_map(dev, 0))
3546                         return 1;
3547                 else {
3548                         /*
3549                          * 32 bit DMA is removed from si_domain and fall back
3550                          * to non-identity mapping.
3551                          */
3552                         dmar_remove_one_dev_info(si_domain, dev);
3553                         pr_info("32bit %s uses non-identity mapping\n",
3554                                 dev_name(dev));
3555                         return 0;
3556                 }
3557         } else {
3558                 /*
3559                  * In case of a detached 64 bit DMA device from vm, the device
3560                  * is put into si_domain for identity mapping.
3561                  */
3562                 if (iommu_should_identity_map(dev, 0)) {
3563                         int ret;
3564                         ret = domain_add_dev_info(si_domain, dev);
3565                         if (!ret) {
3566                                 pr_info("64bit %s uses identity mapping\n",
3567                                         dev_name(dev));
3568                                 return 1;
3569                         }
3570                 }
3571         }
3572
3573         return 0;
3574 }
3575
3576 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3577                                      size_t size, int dir, u64 dma_mask)
3578 {
3579         struct dmar_domain *domain;
3580         phys_addr_t start_paddr;
3581         unsigned long iova_pfn;
3582         int prot = 0;
3583         int ret;
3584         struct intel_iommu *iommu;
3585         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3586
3587         BUG_ON(dir == DMA_NONE);
3588
3589         if (iommu_no_mapping(dev))
3590                 return paddr;
3591
3592         domain = get_valid_domain_for_dev(dev);
3593         if (!domain)
3594                 return 0;
3595
3596         iommu = domain_get_iommu(domain);
3597         size = aligned_nrpages(paddr, size);
3598
3599         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3600         if (!iova_pfn)
3601                 goto error;
3602
3603         /*
3604          * Check if DMAR supports zero-length reads on write only
3605          * mappings..
3606          */
3607         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3608                         !cap_zlr(iommu->cap))
3609                 prot |= DMA_PTE_READ;
3610         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3611                 prot |= DMA_PTE_WRITE;
3612         /*
3613          * paddr - (paddr + size) might be partial page, we should map the whole
3614          * page.  Note: if two part of one page are separately mapped, we
3615          * might have two guest_addr mapping to the same host paddr, but this
3616          * is not a big problem
3617          */
3618         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3619                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3620         if (ret)
3621                 goto error;
3622
3623         /* it's a non-present to present mapping. Only flush if caching mode */
3624         if (cap_caching_mode(iommu->cap))
3625                 iommu_flush_iotlb_psi(iommu, domain,
3626                                       mm_to_dma_pfn(iova_pfn),
3627                                       size, 0, 1);
3628         else
3629                 iommu_flush_write_buffer(iommu);
3630
3631         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3632         start_paddr += paddr & ~PAGE_MASK;
3633         return start_paddr;
3634
3635 error:
3636         if (iova_pfn)
3637                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3638         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3639                 dev_name(dev), size, (unsigned long long)paddr, dir);
3640         return 0;
3641 }
3642
3643 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3644                                  unsigned long offset, size_t size,
3645                                  enum dma_data_direction dir,
3646                                  unsigned long attrs)
3647 {
3648         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3649                                   dir, *dev->dma_mask);
3650 }
3651
3652 static void flush_unmaps(struct deferred_flush_data *flush_data)
3653 {
3654         int i, j;
3655
3656         flush_data->timer_on = 0;
3657
3658         /* just flush them all */
3659         for (i = 0; i < g_num_of_iommus; i++) {
3660                 struct intel_iommu *iommu = g_iommus[i];
3661                 struct deferred_flush_table *flush_table =
3662                                 &flush_data->tables[i];
3663                 if (!iommu)
3664                         continue;
3665
3666                 if (!flush_table->next)
3667                         continue;
3668
3669                 /* In caching mode, global flushes turn emulation expensive */
3670                 if (!cap_caching_mode(iommu->cap))
3671                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3672                                          DMA_TLB_GLOBAL_FLUSH);
3673                 for (j = 0; j < flush_table->next; j++) {
3674                         unsigned long mask;
3675                         struct deferred_flush_entry *entry =
3676                                                 &flush_table->entries[j];
3677                         unsigned long iova_pfn = entry->iova_pfn;
3678                         unsigned long nrpages = entry->nrpages;
3679                         struct dmar_domain *domain = entry->domain;
3680                         struct page *freelist = entry->freelist;
3681
3682                         /* On real hardware multiple invalidations are expensive */
3683                         if (cap_caching_mode(iommu->cap))
3684                                 iommu_flush_iotlb_psi(iommu, domain,
3685                                         mm_to_dma_pfn(iova_pfn),
3686                                         nrpages, !freelist, 0);
3687                         else {
3688                                 mask = ilog2(nrpages);
3689                                 iommu_flush_dev_iotlb(domain,
3690                                                 (uint64_t)iova_pfn << PAGE_SHIFT, mask);
3691                         }
3692                         free_iova_fast(&domain->iovad, iova_pfn, nrpages);
3693                         if (freelist)
3694                                 dma_free_pagelist(freelist);
3695                 }
3696                 flush_table->next = 0;
3697         }
3698
3699         flush_data->size = 0;
3700 }
3701
3702 static void flush_unmaps_timeout(unsigned long cpuid)
3703 {
3704         struct deferred_flush_data *flush_data = per_cpu_ptr(&deferred_flush, cpuid);
3705         unsigned long flags;
3706
3707         spin_lock_irqsave(&flush_data->lock, flags);
3708         flush_unmaps(flush_data);
3709         spin_unlock_irqrestore(&flush_data->lock, flags);
3710 }
3711
3712 static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
3713                       unsigned long nrpages, struct page *freelist)
3714 {
3715         unsigned long flags;
3716         int entry_id, iommu_id;
3717         struct intel_iommu *iommu;
3718         struct deferred_flush_entry *entry;
3719         struct deferred_flush_data *flush_data;
3720         unsigned int cpuid;
3721
3722         cpuid = get_cpu();
3723         flush_data = per_cpu_ptr(&deferred_flush, cpuid);
3724
3725         /* Flush all CPUs' entries to avoid deferring too much.  If
3726          * this becomes a bottleneck, can just flush us, and rely on
3727          * flush timer for the rest.
3728          */
3729         if (flush_data->size == HIGH_WATER_MARK) {
3730                 int cpu;
3731
3732                 for_each_online_cpu(cpu)
3733                         flush_unmaps_timeout(cpu);
3734         }
3735
3736         spin_lock_irqsave(&flush_data->lock, flags);
3737
3738         iommu = domain_get_iommu(dom);
3739         iommu_id = iommu->seq_id;
3740
3741         entry_id = flush_data->tables[iommu_id].next;
3742         ++(flush_data->tables[iommu_id].next);
3743
3744         entry = &flush_data->tables[iommu_id].entries[entry_id];
3745         entry->domain = dom;
3746         entry->iova_pfn = iova_pfn;
3747         entry->nrpages = nrpages;
3748         entry->freelist = freelist;
3749
3750         if (!flush_data->timer_on) {
3751                 mod_timer(&flush_data->timer, jiffies + msecs_to_jiffies(10));
3752                 flush_data->timer_on = 1;
3753         }
3754         flush_data->size++;
3755         spin_unlock_irqrestore(&flush_data->lock, flags);
3756
3757         put_cpu();
3758 }
3759
3760 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3761 {
3762         struct dmar_domain *domain;
3763         unsigned long start_pfn, last_pfn;
3764         unsigned long nrpages;
3765         unsigned long iova_pfn;
3766         struct intel_iommu *iommu;
3767         struct page *freelist;
3768
3769         if (iommu_no_mapping(dev))
3770                 return;
3771
3772         domain = find_domain(dev);
3773         BUG_ON(!domain);
3774
3775         iommu = domain_get_iommu(domain);
3776
3777         iova_pfn = IOVA_PFN(dev_addr);
3778
3779         nrpages = aligned_nrpages(dev_addr, size);
3780         start_pfn = mm_to_dma_pfn(iova_pfn);
3781         last_pfn = start_pfn + nrpages - 1;
3782
3783         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3784                  dev_name(dev), start_pfn, last_pfn);
3785
3786         freelist = domain_unmap(domain, start_pfn, last_pfn);
3787
3788         if (intel_iommu_strict) {
3789                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3790                                       nrpages, !freelist, 0);
3791                 /* free iova */
3792                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3793                 dma_free_pagelist(freelist);
3794         } else {
3795                 add_unmap(domain, iova_pfn, nrpages, freelist);
3796                 /*
3797                  * queue up the release of the unmap to save the 1/6th of the
3798                  * cpu used up by the iotlb flush operation...
3799                  */
3800         }
3801 }
3802
3803 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3804                              size_t size, enum dma_data_direction dir,
3805                              unsigned long attrs)
3806 {
3807         intel_unmap(dev, dev_addr, size);
3808 }
3809
3810 static void *intel_alloc_coherent(struct device *dev, size_t size,
3811                                   dma_addr_t *dma_handle, gfp_t flags,
3812                                   unsigned long attrs)
3813 {
3814         struct page *page = NULL;
3815         int order;
3816
3817         size = PAGE_ALIGN(size);
3818         order = get_order(size);
3819
3820         if (!iommu_no_mapping(dev))
3821                 flags &= ~(GFP_DMA | GFP_DMA32);
3822         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3823                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3824                         flags |= GFP_DMA;
3825                 else
3826                         flags |= GFP_DMA32;
3827         }
3828
3829         if (gfpflags_allow_blocking(flags)) {
3830                 unsigned int count = size >> PAGE_SHIFT;
3831
3832                 page = dma_alloc_from_contiguous(dev, count, order, flags);
3833                 if (page && iommu_no_mapping(dev) &&
3834                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3835                         dma_release_from_contiguous(dev, page, count);
3836                         page = NULL;
3837                 }
3838         }
3839
3840         if (!page)
3841                 page = alloc_pages(flags, order);
3842         if (!page)
3843                 return NULL;
3844         memset(page_address(page), 0, size);
3845
3846         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3847                                          DMA_BIDIRECTIONAL,
3848                                          dev->coherent_dma_mask);
3849         if (*dma_handle)
3850                 return page_address(page);
3851         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3852                 __free_pages(page, order);
3853
3854         return NULL;
3855 }
3856
3857 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3858                                 dma_addr_t dma_handle, unsigned long attrs)
3859 {
3860         int order;
3861         struct page *page = virt_to_page(vaddr);
3862
3863         size = PAGE_ALIGN(size);
3864         order = get_order(size);
3865
3866         intel_unmap(dev, dma_handle, size);
3867         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3868                 __free_pages(page, order);
3869 }
3870
3871 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3872                            int nelems, enum dma_data_direction dir,
3873                            unsigned long attrs)
3874 {
3875         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3876         unsigned long nrpages = 0;
3877         struct scatterlist *sg;
3878         int i;
3879
3880         for_each_sg(sglist, sg, nelems, i) {
3881                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3882         }
3883
3884         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3885 }
3886
3887 static int intel_nontranslate_map_sg(struct device *hddev,
3888         struct scatterlist *sglist, int nelems, int dir)
3889 {
3890         int i;
3891         struct scatterlist *sg;
3892
3893         for_each_sg(sglist, sg, nelems, i) {
3894                 BUG_ON(!sg_page(sg));
3895                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3896                 sg->dma_length = sg->length;
3897         }
3898         return nelems;
3899 }
3900
3901 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3902                         enum dma_data_direction dir, unsigned long attrs)
3903 {
3904         int i;
3905         struct dmar_domain *domain;
3906         size_t size = 0;
3907         int prot = 0;
3908         unsigned long iova_pfn;
3909         int ret;
3910         struct scatterlist *sg;
3911         unsigned long start_vpfn;
3912         struct intel_iommu *iommu;
3913
3914         BUG_ON(dir == DMA_NONE);
3915         if (iommu_no_mapping(dev))
3916                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3917
3918         domain = get_valid_domain_for_dev(dev);
3919         if (!domain)
3920                 return 0;
3921
3922         iommu = domain_get_iommu(domain);
3923
3924         for_each_sg(sglist, sg, nelems, i)
3925                 size += aligned_nrpages(sg->offset, sg->length);
3926
3927         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3928                                 *dev->dma_mask);
3929         if (!iova_pfn) {
3930                 sglist->dma_length = 0;
3931                 return 0;
3932         }
3933
3934         /*
3935          * Check if DMAR supports zero-length reads on write only
3936          * mappings..
3937          */
3938         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3939                         !cap_zlr(iommu->cap))
3940                 prot |= DMA_PTE_READ;
3941         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3942                 prot |= DMA_PTE_WRITE;
3943
3944         start_vpfn = mm_to_dma_pfn(iova_pfn);
3945
3946         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3947         if (unlikely(ret)) {
3948                 dma_pte_free_pagetable(domain, start_vpfn,
3949                                        start_vpfn + size - 1);
3950                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3951                 return 0;
3952         }
3953
3954         /* it's a non-present to present mapping. Only flush if caching mode */
3955         if (cap_caching_mode(iommu->cap))
3956                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3957         else
3958                 iommu_flush_write_buffer(iommu);
3959
3960         return nelems;
3961 }
3962
3963 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3964 {
3965         return !dma_addr;
3966 }
3967
3968 struct dma_map_ops intel_dma_ops = {
3969         .alloc = intel_alloc_coherent,
3970         .free = intel_free_coherent,
3971         .map_sg = intel_map_sg,
3972         .unmap_sg = intel_unmap_sg,
3973         .map_page = intel_map_page,
3974         .unmap_page = intel_unmap_page,
3975         .mapping_error = intel_mapping_error,
3976 };
3977
3978 static inline int iommu_domain_cache_init(void)
3979 {
3980         int ret = 0;
3981
3982         iommu_domain_cache = kmem_cache_create("iommu_domain",
3983                                          sizeof(struct dmar_domain),
3984                                          0,
3985                                          SLAB_HWCACHE_ALIGN,
3986
3987                                          NULL);
3988         if (!iommu_domain_cache) {
3989                 pr_err("Couldn't create iommu_domain cache\n");
3990                 ret = -ENOMEM;
3991         }
3992
3993         return ret;
3994 }
3995
3996 static inline int iommu_devinfo_cache_init(void)
3997 {
3998         int ret = 0;
3999
4000         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4001                                          sizeof(struct device_domain_info),
4002                                          0,
4003                                          SLAB_HWCACHE_ALIGN,
4004                                          NULL);
4005         if (!iommu_devinfo_cache) {
4006                 pr_err("Couldn't create devinfo cache\n");
4007                 ret = -ENOMEM;
4008         }
4009
4010         return ret;
4011 }
4012
4013 static int __init iommu_init_mempool(void)
4014 {
4015         int ret;
4016         ret = iova_cache_get();
4017         if (ret)
4018                 return ret;
4019
4020         ret = iommu_domain_cache_init();
4021         if (ret)
4022                 goto domain_error;
4023
4024         ret = iommu_devinfo_cache_init();
4025         if (!ret)
4026                 return ret;
4027
4028         kmem_cache_destroy(iommu_domain_cache);
4029 domain_error:
4030         iova_cache_put();
4031
4032         return -ENOMEM;
4033 }
4034
4035 static void __init iommu_exit_mempool(void)
4036 {
4037         kmem_cache_destroy(iommu_devinfo_cache);
4038         kmem_cache_destroy(iommu_domain_cache);
4039         iova_cache_put();
4040 }
4041
4042 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4043 {
4044         struct dmar_drhd_unit *drhd;
4045         u32 vtbar;
4046         int rc;
4047
4048         /* We know that this device on this chipset has its own IOMMU.
4049          * If we find it under a different IOMMU, then the BIOS is lying
4050          * to us. Hope that the IOMMU for this device is actually
4051          * disabled, and it needs no translation...
4052          */
4053         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4054         if (rc) {
4055                 /* "can't" happen */
4056                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4057                 return;
4058         }
4059         vtbar &= 0xffff0000;
4060
4061         /* we know that the this iommu should be at offset 0xa000 from vtbar */
4062         drhd = dmar_find_matched_drhd_unit(pdev);
4063         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4064                             TAINT_FIRMWARE_WORKAROUND,
4065                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4066                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4067 }
4068 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4069
4070 static void __init init_no_remapping_devices(void)
4071 {
4072         struct dmar_drhd_unit *drhd;
4073         struct device *dev;
4074         int i;
4075
4076         for_each_drhd_unit(drhd) {
4077                 if (!drhd->include_all) {
4078                         for_each_active_dev_scope(drhd->devices,
4079                                                   drhd->devices_cnt, i, dev)
4080                                 break;
4081                         /* ignore DMAR unit if no devices exist */
4082                         if (i == drhd->devices_cnt)
4083                                 drhd->ignored = 1;
4084                 }
4085         }
4086
4087         for_each_active_drhd_unit(drhd) {
4088                 if (drhd->include_all)
4089                         continue;
4090
4091                 for_each_active_dev_scope(drhd->devices,
4092                                           drhd->devices_cnt, i, dev)
4093                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4094                                 break;
4095                 if (i < drhd->devices_cnt)
4096                         continue;
4097
4098                 /* This IOMMU has *only* gfx devices. Either bypass it or
4099                    set the gfx_mapped flag, as appropriate */
4100                 if (dmar_map_gfx) {
4101                         intel_iommu_gfx_mapped = 1;
4102                 } else {
4103                         drhd->ignored = 1;
4104                         for_each_active_dev_scope(drhd->devices,
4105                                                   drhd->devices_cnt, i, dev)
4106                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4107                 }
4108         }
4109 }
4110
4111 #ifdef CONFIG_SUSPEND
4112 static int init_iommu_hw(void)
4113 {
4114         struct dmar_drhd_unit *drhd;
4115         struct intel_iommu *iommu = NULL;
4116
4117         for_each_active_iommu(iommu, drhd)
4118                 if (iommu->qi)
4119                         dmar_reenable_qi(iommu);
4120
4121         for_each_iommu(iommu, drhd) {
4122                 if (drhd->ignored) {
4123                         /*
4124                          * we always have to disable PMRs or DMA may fail on
4125                          * this device
4126                          */
4127                         if (force_on)
4128                                 iommu_disable_protect_mem_regions(iommu);
4129                         continue;
4130                 }
4131
4132                 iommu_flush_write_buffer(iommu);
4133
4134                 iommu_set_root_entry(iommu);
4135
4136                 iommu->flush.flush_context(iommu, 0, 0, 0,
4137                                            DMA_CCMD_GLOBAL_INVL);
4138                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4139                 iommu_enable_translation(iommu);
4140                 iommu_disable_protect_mem_regions(iommu);
4141         }
4142
4143         return 0;
4144 }
4145
4146 static void iommu_flush_all(void)
4147 {
4148         struct dmar_drhd_unit *drhd;
4149         struct intel_iommu *iommu;
4150
4151         for_each_active_iommu(iommu, drhd) {
4152                 iommu->flush.flush_context(iommu, 0, 0, 0,
4153                                            DMA_CCMD_GLOBAL_INVL);
4154                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4155                                          DMA_TLB_GLOBAL_FLUSH);
4156         }
4157 }
4158
4159 static int iommu_suspend(void)
4160 {
4161         struct dmar_drhd_unit *drhd;
4162         struct intel_iommu *iommu = NULL;
4163         unsigned long flag;
4164
4165         for_each_active_iommu(iommu, drhd) {
4166                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
4167                                                  GFP_ATOMIC);
4168                 if (!iommu->iommu_state)
4169                         goto nomem;
4170         }
4171
4172         iommu_flush_all();
4173
4174         for_each_active_iommu(iommu, drhd) {
4175                 iommu_disable_translation(iommu);
4176
4177                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4178
4179                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4180                         readl(iommu->reg + DMAR_FECTL_REG);
4181                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4182                         readl(iommu->reg + DMAR_FEDATA_REG);
4183                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4184                         readl(iommu->reg + DMAR_FEADDR_REG);
4185                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4186                         readl(iommu->reg + DMAR_FEUADDR_REG);
4187
4188                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4189         }
4190         return 0;
4191
4192 nomem:
4193         for_each_active_iommu(iommu, drhd)
4194                 kfree(iommu->iommu_state);
4195
4196         return -ENOMEM;
4197 }
4198
4199 static void iommu_resume(void)
4200 {
4201         struct dmar_drhd_unit *drhd;
4202         struct intel_iommu *iommu = NULL;
4203         unsigned long flag;
4204
4205         if (init_iommu_hw()) {
4206                 if (force_on)
4207                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4208                 else
4209                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4210                 return;
4211         }
4212
4213         for_each_active_iommu(iommu, drhd) {
4214
4215                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4216
4217                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4218                         iommu->reg + DMAR_FECTL_REG);
4219                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4220                         iommu->reg + DMAR_FEDATA_REG);
4221                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4222                         iommu->reg + DMAR_FEADDR_REG);
4223                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4224                         iommu->reg + DMAR_FEUADDR_REG);
4225
4226                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4227         }
4228
4229         for_each_active_iommu(iommu, drhd)
4230                 kfree(iommu->iommu_state);
4231 }
4232
/* Syscore callbacks wiring iommu_suspend()/iommu_resume() into the PM core. */
static struct syscore_ops iommu_syscore_ops = {
        .resume         = iommu_resume,
        .suspend        = iommu_suspend,
};
4237
/* Register the IOMMU suspend/resume callbacks with the syscore framework. */
static void __init init_iommu_pm_ops(void)
{
        register_syscore_ops(&iommu_syscore_ops);
}
4242
4243 #else
/* No-op stub used when CONFIG_SUSPEND is not set. */
static inline void init_iommu_pm_ops(void) {}
#endif  /* CONFIG_SUSPEND */
4246
4247
4248 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4249 {
4250         struct acpi_dmar_reserved_memory *rmrr;
4251         int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4252         struct dmar_rmrr_unit *rmrru;
4253         size_t length;
4254
4255         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4256         if (!rmrru)
4257                 goto out;
4258
4259         rmrru->hdr = header;
4260         rmrr = (struct acpi_dmar_reserved_memory *)header;
4261         rmrru->base_address = rmrr->base_address;
4262         rmrru->end_address = rmrr->end_address;
4263
4264         length = rmrr->end_address - rmrr->base_address + 1;
4265         rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4266                                               IOMMU_RESV_DIRECT);
4267         if (!rmrru->resv)
4268                 goto free_rmrru;
4269
4270         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4271                                 ((void *)rmrr) + rmrr->header.length,
4272                                 &rmrru->devices_cnt);
4273         if (rmrru->devices_cnt && rmrru->devices == NULL)
4274                 goto free_all;
4275
4276         list_add(&rmrru->list, &dmar_rmrr_units);
4277
4278         return 0;
4279 free_all:
4280         kfree(rmrru->resv);
4281 free_rmrru:
4282         kfree(rmrru);
4283 out:
4284         return -ENOMEM;
4285 }
4286
4287 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4288 {
4289         struct dmar_atsr_unit *atsru;
4290         struct acpi_dmar_atsr *tmp;
4291
4292         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4293                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4294                 if (atsr->segment != tmp->segment)
4295                         continue;
4296                 if (atsr->header.length != tmp->header.length)
4297                         continue;
4298                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4299                         return atsru;
4300         }
4301
4302         return NULL;
4303 }
4304
4305 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4306 {
4307         struct acpi_dmar_atsr *atsr;
4308         struct dmar_atsr_unit *atsru;
4309
4310         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
4311                 return 0;
4312
4313         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4314         atsru = dmar_find_atsr(atsr);
4315         if (atsru)
4316                 return 0;
4317
4318         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4319         if (!atsru)
4320                 return -ENOMEM;
4321
4322         /*
4323          * If memory is allocated from slab by ACPI _DSM method, we need to
4324          * copy the memory content because the memory buffer will be freed
4325          * on return.
4326          */
4327         atsru->hdr = (void *)(atsru + 1);
4328         memcpy(atsru->hdr, hdr, hdr->length);
4329         atsru->include_all = atsr->flags & 0x1;
4330         if (!atsru->include_all) {
4331                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4332                                 (void *)atsr + atsr->header.length,
4333                                 &atsru->devices_cnt);
4334                 if (atsru->devices_cnt && atsru->devices == NULL) {
4335                         kfree(atsru);
4336                         return -ENOMEM;
4337                 }
4338         }
4339
4340         list_add_rcu(&atsru->list, &dmar_atsr_units);
4341
4342         return 0;
4343 }
4344
/*
 * Release an ATSR unit: free its device scope array, then the unit itself.
 * The caller is expected to have already unlinked it from dmar_atsr_units.
 */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
        dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
        kfree(atsru);
}
4350
4351 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4352 {
4353         struct acpi_dmar_atsr *atsr;
4354         struct dmar_atsr_unit *atsru;
4355
4356         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4357         atsru = dmar_find_atsr(atsr);
4358         if (atsru) {
4359                 list_del_rcu(&atsru->list);
4360                 synchronize_rcu();
4361                 intel_iommu_free_atsr(atsru);
4362         }
4363
4364         return 0;
4365 }
4366
4367 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4368 {
4369         int i;
4370         struct device *dev;
4371         struct acpi_dmar_atsr *atsr;
4372         struct dmar_atsr_unit *atsru;
4373
4374         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4375         atsru = dmar_find_atsr(atsr);
4376         if (!atsru)
4377                 return 0;
4378
4379         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4380                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4381                                           i, dev)
4382                         return -EBUSY;
4383         }
4384
4385         return 0;
4386 }
4387
/*
 * Bring up the IOMMU belonging to a DRHD unit (called from
 * dmar_iommu_hotplug() on insert).
 *
 * Returns 0 if the unit is already present in g_iommus or was set up
 * successfully, -ENXIO if the hardware lacks a capability the current
 * configuration depends on (pass-through, snooping, super pages), or the
 * error from one of the setup helpers. On a setup error the IOMMU is torn
 * down again via disable_dmar_iommu()/free_dmar_iommu().
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
        int sp, ret = 0;
        struct intel_iommu *iommu = dmaru->iommu;

        /* Already registered: nothing to do. */
        if (g_iommus[iommu->seq_id])
                return 0;

        /* Capability checks against what the running configuration uses. */
        if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
                pr_warn("%s: Doesn't support hardware pass through.\n",
                        iommu->name);
                return -ENXIO;
        }
        if (!ecap_sc_support(iommu->ecap) &&
            domain_update_iommu_snooping(iommu)) {
                pr_warn("%s: Doesn't support snooping.\n",
                        iommu->name);
                return -ENXIO;
        }
        sp = domain_update_iommu_superpage(iommu) - 1;
        if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
                pr_warn("%s: Doesn't support large page.\n",
                        iommu->name);
                return -ENXIO;
        }

        /*
         * Disable translation if already enabled prior to OS handover.
         */
        if (iommu->gcmd & DMA_GCMD_TE)
                iommu_disable_translation(iommu);

        /* Publish the unit, then allocate its domain-id state and root table. */
        g_iommus[iommu->seq_id] = iommu;
        ret = iommu_init_domains(iommu);
        if (ret == 0)
                ret = iommu_alloc_root_entry(iommu);
        if (ret)
                goto out;

#ifdef CONFIG_INTEL_IOMMU_SVM
        if (pasid_enabled(iommu))
                intel_svm_alloc_pasid_tables(iommu);
#endif

        if (dmaru->ignored) {
                /*
                 * we always have to disable PMRs or DMA may fail on this device
                 */
                if (force_on)
                        iommu_disable_protect_mem_regions(iommu);
                return 0;
        }

        intel_iommu_init_qi(iommu);
        iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
        if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
                ret = intel_svm_enable_prq(iommu);
                if (ret)
                        goto disable_iommu;
        }
#endif
        ret = dmar_set_interrupt(iommu);
        if (ret)
                goto disable_iommu;

        /*
         * Install the root table, globally invalidate the context cache and
         * IOTLB, and only then turn translation on.
         */
        iommu_set_root_entry(iommu);
        iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
        iommu_enable_translation(iommu);

        iommu_disable_protect_mem_regions(iommu);
        return 0;

disable_iommu:
        disable_dmar_iommu(iommu);
out:
        free_dmar_iommu(iommu);
        return ret;
}
4469
4470 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4471 {
4472         int ret = 0;
4473         struct intel_iommu *iommu = dmaru->iommu;
4474
4475         if (!intel_iommu_enabled)
4476                 return 0;
4477         if (iommu == NULL)
4478                 return -EINVAL;
4479
4480         if (insert) {
4481                 ret = intel_iommu_add(dmaru);
4482         } else {
4483                 disable_dmar_iommu(iommu);
4484                 free_dmar_iommu(iommu);
4485         }
4486
4487         return ret;
4488 }
4489
4490 static void intel_iommu_free_dmars(void)
4491 {
4492         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4493         struct dmar_atsr_unit *atsru, *atsr_n;
4494
4495         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4496                 list_del(&rmrru->list);
4497                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4498                 kfree(rmrru->resv);
4499                 kfree(rmrru);
4500         }
4501
4502         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4503                 list_del(&atsru->list);
4504                 intel_iommu_free_atsr(atsru);
4505         }
4506 }
4507
4508 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4509 {
4510         int i, ret = 1;
4511         struct pci_bus *bus;
4512         struct pci_dev *bridge = NULL;
4513         struct device *tmp;
4514         struct acpi_dmar_atsr *atsr;
4515         struct dmar_atsr_unit *atsru;
4516
4517         dev = pci_physfn(dev);
4518         for (bus = dev->bus; bus; bus = bus->parent) {
4519                 bridge = bus->self;
4520                 /* If it's an integrated device, allow ATS */
4521                 if (!bridge)
4522                         return 1;
4523                 /* Connected via non-PCIe: no ATS */
4524                 if (!pci_is_pcie(bridge) ||
4525                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4526                         return 0;
4527                 /* If we found the root port, look it up in the ATSR */
4528                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4529                         break;
4530         }
4531
4532         rcu_read_lock();
4533         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4534                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4535                 if (atsr->segment != pci_domain_nr(dev->bus))
4536                         continue;
4537
4538                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4539                         if (tmp == &bridge->dev)
4540                                 goto out;
4541
4542                 if (atsru->include_all)
4543                         goto out;
4544         }
4545         ret = 0;
4546 out:
4547         rcu_read_unlock();
4548
4549         return ret;
4550 }
4551
4552 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4553 {
4554         int ret = 0;
4555         struct dmar_rmrr_unit *rmrru;
4556         struct dmar_atsr_unit *atsru;
4557         struct acpi_dmar_atsr *atsr;
4558         struct acpi_dmar_reserved_memory *rmrr;
4559
4560         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4561                 return 0;
4562
4563         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4564                 rmrr = container_of(rmrru->hdr,
4565                                     struct acpi_dmar_reserved_memory, header);
4566                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4567                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4568                                 ((void *)rmrr) + rmrr->header.length,
4569                                 rmrr->segment, rmrru->devices,
4570                                 rmrru->devices_cnt);
4571                         if(ret < 0)
4572                                 return ret;
4573                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4574                         dmar_remove_dev_scope(info, rmrr->segment,
4575                                 rmrru->devices, rmrru->devices_cnt);
4576                 }
4577         }
4578
4579         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4580                 if (atsru->include_all)
4581                         continue;
4582
4583                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4584                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4585                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4586                                         (void *)atsr + atsr->header.length,
4587                                         atsr->segment, atsru->devices,
4588                                         atsru->devices_cnt);
4589                         if (ret > 0)
4590                                 break;
4591                         else if(ret < 0)
4592                                 return ret;
4593                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4594                         if (dmar_remove_dev_scope(info, atsr->segment,
4595                                         atsru->devices, atsru->devices_cnt))
4596                                 break;
4597                 }
4598         }
4599
4600         return 0;
4601 }
4602
4603 /*
4604  * Here we only respond to action of unbound device from driver.
4605  *
4606  * Added device is not attached to its DMAR domain here yet. That will happen
4607  * when mapping the device to iova.
4608  */
4609 static int device_notifier(struct notifier_block *nb,
4610                                   unsigned long action, void *data)
4611 {
4612         struct device *dev = data;
4613         struct dmar_domain *domain;
4614
4615         if (iommu_dummy(dev))
4616                 return 0;
4617
4618         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4619                 return 0;
4620
4621         domain = find_domain(dev);
4622         if (!domain)
4623                 return 0;
4624
4625         dmar_remove_one_dev_info(domain, dev);
4626         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4627                 domain_exit(domain);
4628
4629         return 0;
4630 }
4631
/* Notifier block registered on the PCI bus by intel_iommu_init(). */
static struct notifier_block device_nb = {
        .notifier_call = device_notifier,
};
4635
4636 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4637                                        unsigned long val, void *v)
4638 {
4639         struct memory_notify *mhp = v;
4640         unsigned long long start, end;
4641         unsigned long start_vpfn, last_vpfn;
4642
4643         switch (val) {
4644         case MEM_GOING_ONLINE:
4645                 start = mhp->start_pfn << PAGE_SHIFT;
4646                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4647                 if (iommu_domain_identity_map(si_domain, start, end)) {
4648                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4649                                 start, end);
4650                         return NOTIFY_BAD;
4651                 }
4652                 break;
4653
4654         case MEM_OFFLINE:
4655         case MEM_CANCEL_ONLINE:
4656                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4657                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4658                 while (start_vpfn <= last_vpfn) {
4659                         struct iova *iova;
4660                         struct dmar_drhd_unit *drhd;
4661                         struct intel_iommu *iommu;
4662                         struct page *freelist;
4663
4664                         iova = find_iova(&si_domain->iovad, start_vpfn);
4665                         if (iova == NULL) {
4666                                 pr_debug("Failed get IOVA for PFN %lx\n",
4667                                          start_vpfn);
4668                                 break;
4669                         }
4670
4671                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4672                                                      start_vpfn, last_vpfn);
4673                         if (iova == NULL) {
4674                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4675                                         start_vpfn, last_vpfn);
4676                                 return NOTIFY_BAD;
4677                         }
4678
4679                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4680                                                iova->pfn_hi);
4681
4682                         rcu_read_lock();
4683                         for_each_active_iommu(iommu, drhd)
4684                                 iommu_flush_iotlb_psi(iommu, si_domain,
4685                                         iova->pfn_lo, iova_size(iova),
4686                                         !freelist, 0);
4687                         rcu_read_unlock();
4688                         dma_free_pagelist(freelist);
4689
4690                         start_vpfn = iova->pfn_hi + 1;
4691                         free_iova_mem(iova);
4692                 }
4693                 break;
4694         }
4695
4696         return NOTIFY_OK;
4697 }
4698
/* Memory hotplug notifier block for intel_iommu_memory_notifier(). */
static struct notifier_block intel_iommu_memory_nb = {
        .notifier_call = intel_iommu_memory_notifier,
        .priority = 0
};
4703
4704 static void free_all_cpu_cached_iovas(unsigned int cpu)
4705 {
4706         int i;
4707
4708         for (i = 0; i < g_num_of_iommus; i++) {
4709                 struct intel_iommu *iommu = g_iommus[i];
4710                 struct dmar_domain *domain;
4711                 int did;
4712
4713                 if (!iommu)
4714                         continue;
4715
4716                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4717                         domain = get_iommu_domain(iommu, (u16)did);
4718
4719                         if (!domain)
4720                                 continue;
4721                         free_cpu_cached_iovas(cpu, &domain->iovad);
4722                 }
4723         }
4724 }
4725
/*
 * CPU hotplug callback (registered for CPUHP_IOMMU_INTEL_DEAD): release the
 * IOVAs cached for @cpu and run flush_unmaps_timeout() for it.
 * Always returns 0.
 */
static int intel_iommu_cpu_dead(unsigned int cpu)
{
        free_all_cpu_cached_iovas(cpu);
        flush_unmaps_timeout(cpu);
        return 0;
}
4732
/* Map the sysfs device embedded at iommu.dev back to its struct intel_iommu. */
static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
{
        return container_of(dev, struct intel_iommu, iommu.dev);
}
4737
4738 static ssize_t intel_iommu_show_version(struct device *dev,
4739                                         struct device_attribute *attr,
4740                                         char *buf)
4741 {
4742         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4743         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4744         return sprintf(buf, "%d:%d\n",
4745                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4746 }
4747 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4748
4749 static ssize_t intel_iommu_show_address(struct device *dev,
4750                                         struct device_attribute *attr,
4751                                         char *buf)
4752 {
4753         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4754         return sprintf(buf, "%llx\n", iommu->reg_phys);
4755 }
4756 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4757
4758 static ssize_t intel_iommu_show_cap(struct device *dev,
4759                                     struct device_attribute *attr,
4760                                     char *buf)
4761 {
4762         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4763         return sprintf(buf, "%llx\n", iommu->cap);
4764 }
4765 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4766
4767 static ssize_t intel_iommu_show_ecap(struct device *dev,
4768                                     struct device_attribute *attr,
4769                                     char *buf)
4770 {
4771         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4772         return sprintf(buf, "%llx\n", iommu->ecap);
4773 }
4774 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4775
4776 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4777                                       struct device_attribute *attr,
4778                                       char *buf)
4779 {
4780         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4781         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4782 }
4783 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4784
4785 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4786                                            struct device_attribute *attr,
4787                                            char *buf)
4788 {
4789         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4790         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4791                                                   cap_ndoms(iommu->cap)));
4792 }
4793 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4794
/* NULL-terminated list of the per-IOMMU sysfs attributes defined above. */
static struct attribute *intel_iommu_attrs[] = {
        &dev_attr_version.attr,
        &dev_attr_address.attr,
        &dev_attr_cap.attr,
        &dev_attr_ecap.attr,
        &dev_attr_domains_supported.attr,
        &dev_attr_domains_used.attr,
        NULL,
};
4804
/* Attribute group named "intel-iommu" holding intel_iommu_attrs. */
static struct attribute_group intel_iommu_group = {
        .name = "intel-iommu",
        .attrs = intel_iommu_attrs,
};
4809
/*
 * NULL-terminated group list handed to iommu_device_sysfs_add() in
 * intel_iommu_init().
 */
const struct attribute_group *intel_iommu_groups[] = {
        &intel_iommu_group,
        NULL,
};
4814
/*
 * Driver entry point: parse the DMAR table, initialise all DMAR units and
 * install the Intel IOMMU DMA ops.
 *
 * When tboot requires the IOMMU (force_on), any setup failure panics
 * instead of returning an error.
 *
 * Returns 0 on success, -ENOMEM if the memory pools cannot be set up,
 * -ENODEV if table/scope initialisation fails, the IOMMU is disabled or the
 * reserved ranges cannot be set up, or the error from init_dmars().
 */
int __init intel_iommu_init(void)
{
        int ret = -ENODEV;
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;

        /* VT-d is required for a TXT/tboot launch, so enforce that */
        force_on = tboot_force_iommu();

        if (iommu_init_mempool()) {
                if (force_on)
                        panic("tboot: Failed to initialize iommu memory\n");
                return -ENOMEM;
        }

        /* dmar_global_lock is held for writing until init_dmars() is done. */
        down_write(&dmar_global_lock);
        if (dmar_table_init()) {
                if (force_on)
                        panic("tboot: Failed to initialize DMAR table\n");
                goto out_free_dmar;
        }

        if (dmar_dev_scope_init() < 0) {
                if (force_on)
                        panic("tboot: Failed to initialize DMAR device scope\n");
                goto out_free_dmar;
        }

        if (no_iommu || dmar_disabled)
                goto out_free_dmar;

        if (list_empty(&dmar_rmrr_units))
                pr_info("No RMRR found\n");

        if (list_empty(&dmar_atsr_units))
                pr_info("No ATSR found\n");

        if (dmar_init_reserved_ranges()) {
                if (force_on)
                        panic("tboot: Failed to reserve iommu ranges\n");
                goto out_free_reserved_range;
        }

        init_no_remapping_devices();

        ret = init_dmars();
        if (ret) {
                if (force_on)
                        panic("tboot: Failed to initialize DMARs\n");
                pr_err("Initialization failed\n");
                goto out_free_reserved_range;
        }
        up_write(&dmar_global_lock);
        pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

#ifdef CONFIG_SWIOTLB
        swiotlb = 0;
#endif
        dma_ops = &intel_dma_ops;

        init_iommu_pm_ops();

        /* Expose each active IOMMU in sysfs and register it with the core. */
        for_each_active_iommu(iommu, drhd) {
                iommu_device_sysfs_add(&iommu->iommu, NULL,
                                       intel_iommu_groups,
                                       "%s", iommu->name);
                iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
                iommu_device_register(&iommu->iommu);
        }

        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
        bus_register_notifier(&pci_bus_type, &device_nb);
        /* The memory notifier is only registered for a software identity map. */
        if (si_domain && !hw_pass_through)
                register_memory_notifier(&intel_iommu_memory_nb);
        cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
                          intel_iommu_cpu_dead);
        intel_iommu_enabled = 1;

        return 0;

        /* Error unwinding; both labels are reached with dmar_global_lock held. */
out_free_reserved_range:
        put_iova_domain(&reserved_iova_list);
out_free_dmar:
        intel_iommu_free_dmars();
        up_write(&dmar_global_lock);
        iommu_exit_mempool();
        return ret;
}
4903
4904 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4905 {
4906         struct intel_iommu *iommu = opaque;
4907
4908         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4909         return 0;
4910 }
4911
4912 /*
4913  * NB - intel-iommu lacks any sort of reference counting for the users of
4914  * dependent devices.  If multiple endpoints have intersecting dependent
4915  * devices, unbinding the driver from any one of them will possibly leave
4916  * the others unable to operate.
4917  */
4918 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4919 {
4920         if (!iommu || !dev || !dev_is_pci(dev))
4921                 return;
4922
4923         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4924 }
4925
4926 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4927 {
4928         struct intel_iommu *iommu;
4929         unsigned long flags;
4930
4931         assert_spin_locked(&device_domain_lock);
4932
4933         if (WARN_ON(!info))
4934                 return;
4935
4936         iommu = info->iommu;
4937
4938         if (info->dev) {
4939                 iommu_disable_dev_iotlb(info);
4940                 domain_context_clear(iommu, info->dev);
4941         }
4942
4943         unlink_domain_info(info);
4944
4945         spin_lock_irqsave(&iommu->lock, flags);
4946         domain_detach_iommu(info->domain, iommu);
4947         spin_unlock_irqrestore(&iommu->lock, flags);
4948
4949         free_devinfo_mem(info);
4950 }
4951
4952 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4953                                      struct device *dev)
4954 {
4955         struct device_domain_info *info;
4956         unsigned long flags;
4957
4958         spin_lock_irqsave(&device_domain_lock, flags);
4959         info = dev->archdata.iommu;
4960         __dmar_remove_one_dev_info(info);
4961         spin_unlock_irqrestore(&device_domain_lock, flags);
4962 }
4963
4964 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4965 {
4966         int adjust_width;
4967
4968         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4969                         DMA_32BIT_PFN);
4970         domain_reserve_special_ranges(domain);
4971
4972         /* calculate AGAW */
4973         domain->gaw = guest_width;
4974         adjust_width = guestwidth_to_adjustwidth(guest_width);
4975         domain->agaw = width_to_agaw(adjust_width);
4976
4977         domain->iommu_coherency = 0;
4978         domain->iommu_snooping = 0;
4979         domain->iommu_superpage = 0;
4980         domain->max_addr = 0;
4981
4982         /* always allocate the top pgd */
4983         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4984         if (!domain->pgd)
4985                 return -ENOMEM;
4986         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4987         return 0;
4988 }
4989
4990 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4991 {
4992         struct dmar_domain *dmar_domain;
4993         struct iommu_domain *domain;
4994
4995         if (type != IOMMU_DOMAIN_UNMANAGED)
4996                 return NULL;
4997
4998         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4999         if (!dmar_domain) {
5000                 pr_err("Can't allocate dmar_domain\n");
5001                 return NULL;
5002         }
5003         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5004                 pr_err("Domain initialization failed\n");
5005                 domain_exit(dmar_domain);
5006                 return NULL;
5007         }
5008         domain_update_iommu_cap(dmar_domain);
5009
5010         domain = &dmar_domain->domain;
5011         domain->geometry.aperture_start = 0;
5012         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5013         domain->geometry.force_aperture = true;
5014
5015         return domain;
5016 }
5017
/* iommu_ops->domain_free callback: tear down the backing dmar_domain. */
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);

        domain_exit(dmar_domain);
}
5022
5023 static int intel_iommu_attach_device(struct iommu_domain *domain,
5024                                      struct device *dev)
5025 {
5026         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5027         struct intel_iommu *iommu;
5028         int addr_width;
5029         u8 bus, devfn;
5030
5031         if (device_is_rmrr_locked(dev)) {
5032                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5033                 return -EPERM;
5034         }
5035
5036         /* normally dev is not mapped */
5037         if (unlikely(domain_context_mapped(dev))) {
5038                 struct dmar_domain *old_domain;
5039
5040                 old_domain = find_domain(dev);
5041                 if (old_domain) {
5042                         rcu_read_lock();
5043                         dmar_remove_one_dev_info(old_domain, dev);
5044                         rcu_read_unlock();
5045
5046                         if (!domain_type_is_vm_or_si(old_domain) &&
5047                              list_empty(&old_domain->devices))
5048                                 domain_exit(old_domain);
5049                 }
5050         }
5051
5052         iommu = device_to_iommu(dev, &bus, &devfn);
5053         if (!iommu)
5054                 return -ENODEV;
5055
5056         /* check if this iommu agaw is sufficient for max mapped address */
5057         addr_width = agaw_to_width(iommu->agaw);
5058         if (addr_width > cap_mgaw(iommu->cap))
5059                 addr_width = cap_mgaw(iommu->cap);
5060
5061         if (dmar_domain->max_addr > (1LL << addr_width)) {
5062                 pr_err("%s: iommu width (%d) is not "
5063                        "sufficient for the mapped address (%llx)\n",
5064                        __func__, addr_width, dmar_domain->max_addr);
5065                 return -EFAULT;
5066         }
5067         dmar_domain->gaw = addr_width;
5068
5069         /*
5070          * Knock out extra levels of page tables if necessary
5071          */
5072         while (iommu->agaw < dmar_domain->agaw) {
5073                 struct dma_pte *pte;
5074
5075                 pte = dmar_domain->pgd;
5076                 if (dma_pte_present(pte)) {
5077                         dmar_domain->pgd = (struct dma_pte *)
5078                                 phys_to_virt(dma_pte_addr(pte));
5079                         free_pgtable_page(pte);
5080                 }
5081                 dmar_domain->agaw--;
5082         }
5083
5084         return domain_add_dev_info(dmar_domain, dev);
5085 }
5086
/* iommu_ops->detach_dev callback: remove @dev from the backing dmar_domain. */
static void intel_iommu_detach_device(struct iommu_domain *domain,
                                      struct device *dev)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);

        dmar_remove_one_dev_info(dmar_domain, dev);
}
5092
5093 static int intel_iommu_map(struct iommu_domain *domain,
5094                            unsigned long iova, phys_addr_t hpa,
5095                            size_t size, int iommu_prot)
5096 {
5097         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5098         u64 max_addr;
5099         int prot = 0;
5100         int ret;
5101
5102         if (iommu_prot & IOMMU_READ)
5103                 prot |= DMA_PTE_READ;
5104         if (iommu_prot & IOMMU_WRITE)
5105                 prot |= DMA_PTE_WRITE;
5106         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5107                 prot |= DMA_PTE_SNP;
5108
5109         max_addr = iova + size;
5110         if (dmar_domain->max_addr < max_addr) {
5111                 u64 end;
5112
5113                 /* check if minimum agaw is sufficient for mapped address */
5114                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5115                 if (end < max_addr) {
5116                         pr_err("%s: iommu width (%d) is not "
5117                                "sufficient for the mapped address (%llx)\n",
5118                                __func__, dmar_domain->gaw, max_addr);
5119                         return -EFAULT;
5120                 }
5121                 dmar_domain->max_addr = max_addr;
5122         }
5123         /* Round up size to next multiple of PAGE_SIZE, if it and
5124            the low bits of hpa would take us onto the next page */
5125         size = aligned_nrpages(hpa, size);
5126         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5127                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5128         return ret;
5129 }
5130
5131 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5132                                 unsigned long iova, size_t size)
5133 {
5134         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5135         struct page *freelist = NULL;
5136         struct intel_iommu *iommu;
5137         unsigned long start_pfn, last_pfn;
5138         unsigned int npages;
5139         int iommu_id, level = 0;
5140
5141         /* Cope with horrid API which requires us to unmap more than the
5142            size argument if it happens to be a large-page mapping. */
5143         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5144
5145         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5146                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5147
5148         start_pfn = iova >> VTD_PAGE_SHIFT;
5149         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5150
5151         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5152
5153         npages = last_pfn - start_pfn + 1;
5154
5155         for_each_domain_iommu(iommu_id, dmar_domain) {
5156                 iommu = g_iommus[iommu_id];
5157
5158                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5159                                       start_pfn, npages, !freelist, 0);
5160         }
5161
5162         dma_free_pagelist(freelist);
5163
5164         if (dmar_domain->max_addr == iova + size)
5165                 dmar_domain->max_addr = iova;
5166
5167         return size;
5168 }
5169
5170 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5171                                             dma_addr_t iova)
5172 {
5173         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5174         struct dma_pte *pte;
5175         int level = 0;
5176         u64 phys = 0;
5177
5178         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5179         if (pte)
5180                 phys = dma_pte_addr(pte);
5181
5182         return phys;
5183 }
5184
5185 static bool intel_iommu_capable(enum iommu_cap cap)
5186 {
5187         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5188                 return domain_update_iommu_snooping(NULL) == 1;
5189         if (cap == IOMMU_CAP_INTR_REMAP)
5190                 return irq_remapping_enabled == 1;
5191
5192         return false;
5193 }
5194
5195 static int intel_iommu_add_device(struct device *dev)
5196 {
5197         struct intel_iommu *iommu;
5198         struct iommu_group *group;
5199         u8 bus, devfn;
5200
5201         iommu = device_to_iommu(dev, &bus, &devfn);
5202         if (!iommu)
5203                 return -ENODEV;
5204
5205         iommu_device_link(&iommu->iommu, dev);
5206
5207         group = iommu_group_get_for_dev(dev);
5208
5209         if (IS_ERR(group))
5210                 return PTR_ERR(group);
5211
5212         iommu_group_put(group);
5213         return 0;
5214 }
5215
5216 static void intel_iommu_remove_device(struct device *dev)
5217 {
5218         struct intel_iommu *iommu;
5219         u8 bus, devfn;
5220
5221         iommu = device_to_iommu(dev, &bus, &devfn);
5222         if (!iommu)
5223                 return;
5224
5225         iommu_group_remove_device(dev);
5226
5227         iommu_device_unlink(&iommu->iommu, dev);
5228 }
5229
5230 static void intel_iommu_get_resv_regions(struct device *device,
5231                                          struct list_head *head)
5232 {
5233         struct iommu_resv_region *reg;
5234         struct dmar_rmrr_unit *rmrr;
5235         struct device *i_dev;
5236         int i;
5237
5238         rcu_read_lock();
5239         for_each_rmrr_units(rmrr) {
5240                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5241                                           i, i_dev) {
5242                         if (i_dev != device)
5243                                 continue;
5244
5245                         list_add_tail(&rmrr->resv->list, head);
5246                 }
5247         }
5248         rcu_read_unlock();
5249
5250         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5251                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5252                                       0, IOMMU_RESV_MSI);
5253         if (!reg)
5254                 return;
5255         list_add_tail(&reg->list, head);
5256 }
5257
5258 static void intel_iommu_put_resv_regions(struct device *dev,
5259                                          struct list_head *head)
5260 {
5261         struct iommu_resv_region *entry, *next;
5262
5263         list_for_each_entry_safe(entry, next, head, list) {
5264                 if (entry->type == IOMMU_RESV_RESERVED)
5265                         kfree(entry);
5266         }
5267 }
5268
5269 #ifdef CONFIG_INTEL_IOMMU_SVM
5270 #define MAX_NR_PASID_BITS (20)
5271 static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
5272 {
5273         /*
5274          * Convert ecap_pss to extend context entry pts encoding, also
5275          * respect the soft pasid_max value set by the iommu.
5276          * - number of PASID bits = ecap_pss + 1
5277          * - number of PASID table entries = 2^(pts + 5)
5278          * Therefore, pts = ecap_pss - 4
5279          * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
5280          */
5281         if (ecap_pss(iommu->ecap) < 5)
5282                 return 0;
5283
5284         /* pasid_max is encoded as actual number of entries not the bits */
5285         return find_first_bit((unsigned long *)&iommu->pasid_max,
5286                         MAX_NR_PASID_BITS) - 5;
5287 }
5288
5289 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5290 {
5291         struct device_domain_info *info;
5292         struct context_entry *context;
5293         struct dmar_domain *domain;
5294         unsigned long flags;
5295         u64 ctx_lo;
5296         int ret;
5297
5298         domain = get_valid_domain_for_dev(sdev->dev);
5299         if (!domain)
5300                 return -EINVAL;
5301
5302         spin_lock_irqsave(&device_domain_lock, flags);
5303         spin_lock(&iommu->lock);
5304
5305         ret = -EINVAL;
5306         info = sdev->dev->archdata.iommu;
5307         if (!info || !info->pasid_supported)
5308                 goto out;
5309
5310         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5311         if (WARN_ON(!context))
5312                 goto out;
5313
5314         ctx_lo = context[0].lo;
5315
5316         sdev->did = domain->iommu_did[iommu->seq_id];
5317         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5318
5319         if (!(ctx_lo & CONTEXT_PASIDE)) {
5320                 context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5321                 context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
5322                         intel_iommu_get_pts(iommu);
5323
5324                 wmb();
5325                 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5326                  * extended to permit requests-with-PASID if the PASIDE bit
5327                  * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
5328                  * however, the PASIDE bit is ignored and requests-with-PASID
5329                  * are unconditionally blocked. Which makes less sense.
5330                  * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5331                  * "guest mode" translation types depending on whether ATS
5332                  * is available or not. Annoyingly, we can't use the new
5333                  * modes *unless* PASIDE is set. */
5334                 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5335                         ctx_lo &= ~CONTEXT_TT_MASK;
5336                         if (info->ats_supported)
5337                                 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5338                         else
5339                                 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5340                 }
5341                 ctx_lo |= CONTEXT_PASIDE;
5342                 if (iommu->pasid_state_table)
5343                         ctx_lo |= CONTEXT_DINVE;
5344                 if (info->pri_supported)
5345                         ctx_lo |= CONTEXT_PRS;
5346                 context[0].lo = ctx_lo;
5347                 wmb();
5348                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5349                                            DMA_CCMD_MASK_NOBIT,
5350                                            DMA_CCMD_DEVICE_INVL);
5351         }
5352
5353         /* Enable PASID support in the device, if it wasn't already */
5354         if (!info->pasid_enabled)
5355                 iommu_enable_dev_iotlb(info);
5356
5357         if (info->ats_enabled) {
5358                 sdev->dev_iotlb = 1;
5359                 sdev->qdep = info->ats_qdep;
5360                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5361                         sdev->qdep = 0;
5362         }
5363         ret = 0;
5364
5365  out:
5366         spin_unlock(&iommu->lock);
5367         spin_unlock_irqrestore(&device_domain_lock, flags);
5368
5369         return ret;
5370 }
5371
5372 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5373 {
5374         struct intel_iommu *iommu;
5375         u8 bus, devfn;
5376
5377         if (iommu_dummy(dev)) {
5378                 dev_warn(dev,
5379                          "No IOMMU translation for device; cannot enable SVM\n");
5380                 return NULL;
5381         }
5382
5383         iommu = device_to_iommu(dev, &bus, &devfn);
5384         if ((!iommu)) {
5385                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5386                 return NULL;
5387         }
5388
5389         if (!iommu->pasid_table) {
5390                 dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5391                 return NULL;
5392         }
5393
5394         return iommu;
5395 }
5396 #endif /* CONFIG_INTEL_IOMMU_SVM */
5397
5398 const struct iommu_ops intel_iommu_ops = {
5399         .capable                = intel_iommu_capable,
5400         .domain_alloc           = intel_iommu_domain_alloc,
5401         .domain_free            = intel_iommu_domain_free,
5402         .attach_dev             = intel_iommu_attach_device,
5403         .detach_dev             = intel_iommu_detach_device,
5404         .map                    = intel_iommu_map,
5405         .unmap                  = intel_iommu_unmap,
5406         .map_sg                 = default_iommu_map_sg,
5407         .iova_to_phys           = intel_iommu_iova_to_phys,
5408         .add_device             = intel_iommu_add_device,
5409         .remove_device          = intel_iommu_remove_device,
5410         .get_resv_regions       = intel_iommu_get_resv_regions,
5411         .put_resv_regions       = intel_iommu_put_resv_regions,
5412         .device_group           = pci_device_group,
5413         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5414 };
5415
5416 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5417 {
5418         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5419         pr_info("Disabling IOMMU for graphics on this chipset\n");
5420         dmar_map_gfx = 0;
5421 }
5422
5423 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5424 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5425 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5426 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5427 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5428 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5429 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5430
5431 static void quirk_iommu_rwbf(struct pci_dev *dev)
5432 {
5433         /*
5434          * Mobile 4 Series Chipset neglects to set RWBF capability,
5435          * but needs it. Same seems to hold for the desktop versions.
5436          */
5437         pr_info("Forcing write-buffer flush capability\n");
5438         rwbf_quirk = 1;
5439 }
5440
5441 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5442 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5443 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5444 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5445 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5446 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5447 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5448
5449 #define GGC 0x52
5450 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5451 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5452 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5453 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5454 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5455 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5456 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5457 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5458
5459 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5460 {
5461         unsigned short ggc;
5462
5463         if (pci_read_config_word(dev, GGC, &ggc))
5464                 return;
5465
5466         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5467                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5468                 dmar_map_gfx = 0;
5469         } else if (dmar_map_gfx) {
5470                 /* we have to ensure the gfx device is idle before we flush */
5471                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5472                 intel_iommu_strict = 1;
5473        }
5474 }
5475 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5476 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5477 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5478 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5479
5480 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5481    ISOCH DMAR unit for the Azalia sound device, but not give it any
5482    TLB entries, which causes it to deadlock. Check for that.  We do
5483    this in a function called from init_dmars(), instead of in a PCI
5484    quirk, because we don't want to print the obnoxious "BIOS broken"
5485    message if VT-d is actually disabled.
5486 */
5487 static void __init check_tylersburg_isoch(void)
5488 {
5489         struct pci_dev *pdev;
5490         uint32_t vtisochctrl;
5491
5492         /* If there's no Azalia in the system anyway, forget it. */
5493         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5494         if (!pdev)
5495                 return;
5496         pci_dev_put(pdev);
5497
5498         /* System Management Registers. Might be hidden, in which case
5499            we can't do the sanity check. But that's OK, because the
5500            known-broken BIOSes _don't_ actually hide it, so far. */
5501         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5502         if (!pdev)
5503                 return;
5504
5505         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5506                 pci_dev_put(pdev);
5507                 return;
5508         }
5509
5510         pci_dev_put(pdev);
5511
5512         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5513         if (vtisochctrl & 1)
5514                 return;
5515
5516         /* Drop all bits other than the number of TLB entries */
5517         vtisochctrl &= 0x1c;
5518
5519         /* If we have the recommended number of TLB entries (16), fine. */
5520         if (vtisochctrl == 0x10)
5521                 return;
5522
5523         /* Zero TLB entries? You get to ride the short bus to school. */
5524         if (!vtisochctrl) {
5525                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5526                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5527                      dmi_get_system_info(DMI_BIOS_VENDOR),
5528                      dmi_get_system_info(DMI_BIOS_VERSION),
5529                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5530                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5531                 return;
5532         }
5533
5534         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5535                vtisochctrl);
5536 }