drivers/iommu/intel-iommu.c

   1 /*
   2  * Copyright © 2006-2014 Intel Corporation.
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms and conditions of the GNU General Public License,
   6  * version 2, as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope it will be useful, but WITHOUT
   9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11  * more details.
  12  *
  13  * Authors: David Woodhouse <dwmw2@infradead.org>,
  14  *          Ashok Raj <ashok.raj@intel.com>,
  15  *          Shaohua Li <shaohua.li@intel.com>,
  16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
  17  *          Fenghua Yu <fenghua.yu@intel.com>
  18  *          Joerg Roedel <jroedel@suse.de>
  19  */
  20
  21 #define pr_fmt(fmt)     "DMAR: " fmt
  22
  23 #include <linux/init.h>
  24 #include <linux/bitmap.h>
  25 #include <linux/debugfs.h>
  26 #include <linux/export.h>
  27 #include <linux/slab.h>
  28 #include <linux/irq.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/spinlock.h>
  31 #include <linux/pci.h>
  32 #include <linux/dmar.h>
  33 #include <linux/dma-mapping.h>
  34 #include <linux/mempool.h>
  35 #include <linux/memory.h>
  36 #include <linux/cpu.h>
  37 #include <linux/timer.h>
  38 #include <linux/io.h>
  39 #include <linux/iova.h>
  40 #include <linux/iommu.h>
  41 #include <linux/intel-iommu.h>
  42 #include <linux/syscore_ops.h>
  43 #include <linux/tboot.h>
  44 #include <linux/dmi.h>
  45 #include <linux/pci-ats.h>
  46 #include <linux/memblock.h>
  47 #include <linux/dma-contiguous.h>
  48 #include <linux/crash_dump.h>
  49 #include <asm/irq_remapping.h>
  50 #include <asm/cacheflush.h>
  51 #include <asm/iommu.h>
  52
  53 #include "irq_remapping.h"
  54
  55 #define ROOT_SIZE               VTD_PAGE_SIZE
  56 #define CONTEXT_SIZE            VTD_PAGE_SIZE
  57
  58 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  59 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  60 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  61 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  62
  63 #define IOAPIC_RANGE_START      (0xfee00000)
  64 #define IOAPIC_RANGE_END        (0xfeefffff)
  65 #define IOVA_START_ADDR         (0x1000)
  66
  67 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  68
  69 #define MAX_AGAW_WIDTH 64
  70 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  71
  72 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  73 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  74
  75 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  76    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  77 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  78                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  79 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  80
  81 /* IO virtual address start page frame number */
  82 #define IOVA_START_PFN          (1)
  83
  84 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  85 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  86 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  87
  88 /* page table handling */
  89 #define LEVEL_STRIDE            (9)
  90 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  91
  92 /*
  93  * This bitmap is used to advertise the page sizes our hardware supports
  94  * to the IOMMU core, which will then use this information to split
  95  * physically contiguous memory regions it is mapping into page sizes
  96  * that we support.
  97  *
  98  * Traditionally the IOMMU core just handed us the mappings directly,
  99  * after making sure the size is an order of a 4KiB page and that the
 100  * mapping has natural alignment.
 101  *
 102  * To retain this behavior, we currently advertise that we support
 103  * all page sizes that are an order of 4KiB.
 104  *
 105  * If at some point we'd like to utilize the IOMMU core's new behavior,
 106  * we could change this to advertise the real page sizes we support.
 107  */
 108 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
 109
 110 static inline int agaw_to_level(int agaw)
 111 {
 112         return agaw + 2;
 113 }
 114
 115 static inline int agaw_to_width(int agaw)
 116 {
 117         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 118 }
 119
 120 static inline int width_to_agaw(int width)
 121 {
 122         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 123 }
 124
 125 static inline unsigned int level_to_offset_bits(int level)
 126 {
 127         return (level - 1) * LEVEL_STRIDE;
 128 }
 129
 130 static inline int pfn_level_offset(unsigned long pfn, int level)
 131 {
 132         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 133 }
 134
 135 static inline unsigned long level_mask(int level)
 136 {
 137         return -1UL << level_to_offset_bits(level);
 138 }
 139
 140 static inline unsigned long level_size(int level)
 141 {
 142         return 1UL << level_to_offset_bits(level);
 143 }
 144
 145 static inline unsigned long align_to_level(unsigned long pfn, int level)
 146 {
 147         return (pfn + level_size(level) - 1) & level_mask(level);
 148 }
 149
 150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 151 {
 152         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 153 }
 154
 155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 156    are never going to work. */
 157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 158 {
 159         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 160 }
 161
 162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 163 {
 164         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 165 }
 166 static inline unsigned long page_to_dma_pfn(struct page *pg)
 167 {
 168         return mm_to_dma_pfn(page_to_pfn(pg));
 169 }
 170 static inline unsigned long virt_to_dma_pfn(void *p)
 171 {
 172         return page_to_dma_pfn(virt_to_page(p));
 173 }
 174
 175 /* global iommu list, set NULL for ignored DMAR units */
 176 static struct intel_iommu **g_iommus;
 177
 178 static void __init check_tylersburg_isoch(void);
 179 static int rwbf_quirk;
 180
 181 /*
 182  * set to 1 to panic kernel if can't successfully enable VT-d
 183  * (used when kernel is launched w/ TXT)
 184  */
 185 static int force_on = 0;
 186
 187 /*
 188  * 0: Present
 189  * 1-11: Reserved
 190  * 12-63: Context Ptr (12 - (haw-1))
 191  * 64-127: Reserved
 192  */
/* One 128-bit root-table entry, stored as two 64-bit halves. */
struct root_entry {
	u64     lo;	/* bit 0: present; page-aligned bits: lower context table pointer */
	u64     hi;	/* bit 0: present; page-aligned bits: upper context table pointer */
};
/* Number of root entries that fit in a single VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 198
 199 /*
 200  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 201  * if marked present.
 202  */
 203 static phys_addr_t root_entry_lctp(struct root_entry *re)
 204 {
 205         if (!(re->lo & 1))
 206                 return 0;
 207
 208         return re->lo & VTD_PAGE_MASK;
 209 }
 210
 211 /*
 212  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 213  * if marked present.
 214  */
 215 static phys_addr_t root_entry_uctp(struct root_entry *re)
 216 {
 217         if (!(re->hi & 1))
 218                 return 0;
 219
 220         return re->hi & VTD_PAGE_MASK;
 221 }
 222 /*
 223  * low 64 bits:
 224  * 0: present
 225  * 1: fault processing disable
 226  * 2-3: translation type
 227  * 12-63: address space root
 228  * high 64 bits:
 229  * 0-2: address width
 230  * 3-6: aval
 231  * 8-23: domain id
 232  */
/* One 128-bit context-table entry, manipulated via the context_*() helpers. */
struct context_entry {
	u64 lo;	/* present, fault disable, translation type, PASID enable (bit 11), address root */
	u64 hi;	/* address width, copied flag (bit 3), domain id */
};
 237
 238 static inline void context_clear_pasid_enable(struct context_entry *context)
 239 {
 240         context->lo &= ~(1ULL << 11);
 241 }
 242
 243 static inline bool context_pasid_enabled(struct context_entry *context)
 244 {
 245         return !!(context->lo & (1ULL << 11));
 246 }
 247
 248 static inline void context_set_copied(struct context_entry *context)
 249 {
 250         context->hi |= (1ull << 3);
 251 }
 252
 253 static inline bool context_copied(struct context_entry *context)
 254 {
 255         return !!(context->hi & (1ULL << 3));
 256 }
 257
 258 static inline bool __context_present(struct context_entry *context)
 259 {
 260         return (context->lo & 1);
 261 }
 262
 263 static inline bool context_present(struct context_entry *context)
 264 {
 265         return context_pasid_enabled(context) ?
 266              __context_present(context) :
 267              __context_present(context) && !context_copied(context);
 268 }
 269
 270 static inline void context_set_present(struct context_entry *context)
 271 {
 272         context->lo |= 1;
 273 }
 274
 275 static inline void context_set_fault_enable(struct context_entry *context)
 276 {
 277         context->lo &= (((u64)-1) << 2) | 1;
 278 }
 279
 280 static inline void context_set_translation_type(struct context_entry *context,
 281                                                 unsigned long value)
 282 {
 283         context->lo &= (((u64)-1) << 4) | 3;
 284         context->lo |= (value & 3) << 2;
 285 }
 286
 287 static inline void context_set_address_root(struct context_entry *context,
 288                                             unsigned long value)
 289 {
 290         context->lo &= ~VTD_PAGE_MASK;
 291         context->lo |= value & VTD_PAGE_MASK;
 292 }
 293
 294 static inline void context_set_address_width(struct context_entry *context,
 295                                              unsigned long value)
 296 {
 297         context->hi |= value & 7;
 298 }
 299
 300 static inline void context_set_domain_id(struct context_entry *context,
 301                                          unsigned long value)
 302 {
 303         context->hi |= (value & ((1 << 16) - 1)) << 8;
 304 }
 305
 306 static inline int context_domain_id(struct context_entry *c)
 307 {
 308         return((c->hi >> 8) & 0xffff);
 309 }
 310
/* Zero both halves of a context entry; the low word (present bit) is written first. */
static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
 316
/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;	/* raw 64-bit page-table entry, laid out as described above */
};
 329
 330 static inline void dma_clear_pte(struct dma_pte *pte)
 331 {
 332         pte->val = 0;
 333 }
 334
 335 static inline u64 dma_pte_addr(struct dma_pte *pte)
 336 {
 337 #ifdef CONFIG_64BIT
 338         return pte->val & VTD_PAGE_MASK;
 339 #else
 340         /* Must have a full atomic 64-bit read */
 341         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 342 #endif
 343 }
 344
 345 static inline bool dma_pte_present(struct dma_pte *pte)
 346 {
 347         return (pte->val & 3) != 0;
 348 }
 349
 350 static inline bool dma_pte_superpage(struct dma_pte *pte)
 351 {
 352         return (pte->val & DMA_PTE_LARGE_PAGE);
 353 }
 354
 355 static inline int first_pte_in_page(struct dma_pte *pte)
 356 {
 357         return !((unsigned long)pte & ~VTD_PAGE_MASK);
 358 }
 359
 360 /*
 361  * This domain is a static identity mapping domain.
 362  *      1. This domain creates a static 1:1 mapping to all usable memory.
 363  *      2. It maps to each iommu if successful.
 364  *      3. Each iommu maps to this domain if successful.
 365  */
 366 static struct dmar_domain *si_domain;
 367 static int hw_pass_through = 1;
 368
 369 /*
 370  * Domain represents a virtual machine; more than one device
 371  * across iommus may be owned in one domain, e.g. kvm guest.
 372  */
 373 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
 374
 375 /* si_domain contains multiple devices */
 376 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
 377
 378 #define for_each_domain_iommu(idx, domain)                      \
 379         for (idx = 0; idx < g_num_of_iommus; idx++)             \
 380                 if (domain->iommu_refcnt[idx])
 381
/*
 * Driver-private DMA remapping domain: page-table root, address widths,
 * capability summary of the IOMMUs involved and per-IOMMU bookkeeping.
 * It embeds the generic struct iommu_domain (see to_dmar_domain()).
 */
struct dmar_domain {
	int     nid;                    /* node id */

	unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
					/* Refcount of devices per iommu */


	u16             iommu_did[DMAR_UNITS_SUPPORTED];
					/* Domain ids per IOMMU. Use u16 since
					 * domain ids are 16 bit wide according
					 * to VT-d spec, section 9.3 */

	bool has_iotlb_device;
	struct list_head devices;       /* all devices' list */
	struct iova_domain iovad;       /* iova's that belong to this domain */

	struct dma_pte  *pgd;           /* virtual address */
	int             gaw;            /* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int             agaw;

	int             flags;          /* flags to find out type of domain */

	int             iommu_coherency;/* indicate coherency of iommu access */
	int             iommu_snooping; /* indicate snooping control feature*/
	int             iommu_count;    /* reference count of iommu */
	int             iommu_superpage;/* Level of superpages supported:
					   0 == 4KiB (no superpages), 1 == 2MiB,
					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
	u64             max_addr;       /* maximum mapped address */

	struct iommu_domain domain;     /* generic domain data structure for
					   iommu core */
};
 417
/*
 * PCI domain-device relationship: one record per device, linking it to
 * its IOMMU and its dmar_domain, plus per-device PASID/PRI/ATS state bits.
 */
struct device_domain_info {
	struct list_head link;  /* link to domain siblings */
	struct list_head global; /* link to global list */
	u8 bus;                 /* PCI bus number */
	u8 devfn;               /* PCI devfn number */
	u8 pasid_supported:3;
	u8 pasid_enabled:1;
	u8 pri_supported:1;
	u8 pri_enabled:1;
	u8 ats_supported:1;
	u8 ats_enabled:1;
	u8 ats_qdep;
	struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain; /* pointer to domain */
};
 435
/* One reserved memory region (RMRR) record; kept on the dmar_rmrr_units list. */
struct dmar_rmrr_unit {
	struct list_head list;          /* list of rmrr units   */
	struct acpi_dmar_header *hdr;   /* ACPI header          */
	u64     base_address;           /* reserved base address*/
	u64     end_address;            /* reserved end address */
	struct dmar_dev_scope *devices; /* target devices */
	int     devices_cnt;            /* target device count */
	struct iommu_resv_region *resv; /* reserved region handle */
};
 445
/* One ATSR record; presumably kept on the dmar_atsr_units list. */
struct dmar_atsr_unit {
	struct list_head list;          /* list of ATSR units */
	struct acpi_dmar_header *hdr;   /* ACPI header */
	struct dmar_dev_scope *devices; /* target devices */
	int devices_cnt;                /* target device count */
	u8 include_all:1;               /* include all ports */
};
 453
 454 static LIST_HEAD(dmar_atsr_units);
 455 static LIST_HEAD(dmar_rmrr_units);
 456
 457 #define for_each_rmrr_units(rmrr) \
 458         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 459
 460 static void flush_unmaps_timeout(unsigned long data);
 461
/*
 * One queued entry of a deferred flush table.
 * NOTE(review): field meanings below are inferred from their names; confirm
 * against the code that fills and drains the tables.
 */
struct deferred_flush_entry {
	unsigned long iova_pfn;		/* presumably first IOVA pfn of the range */
	unsigned long nrpages;		/* presumably length of the range in pages */
	struct dmar_domain *domain;	/* domain the range belongs to */
	struct page *freelist;		/* presumably pages to free after the flush */
};
 468
/* Capacity of one deferred flush table. */
#define HIGH_WATER_MARK 250
/* Fixed-size array of deferred flush entries. */
struct deferred_flush_table {
	int next;	/* presumably index of the next free slot in entries[] */
	struct deferred_flush_entry entries[HIGH_WATER_MARK];
};
 474
/*
 * Deferred-flush state; one instance exists per CPU (see the
 * DEFINE_PER_CPU(deferred_flush) definition).
 */
struct deferred_flush_data {
	spinlock_t lock;
	int timer_on;		/* presumably non-zero while @timer is armed */
	struct timer_list timer;
	long size;
	struct deferred_flush_table *tables;
};
 482
 483 DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
 484
 485 /* bitmap for indexing intel_iommus */
 486 static int g_num_of_iommus;
 487
 488 static void domain_exit(struct dmar_domain *domain);
 489 static void domain_remove_dev_info(struct dmar_domain *domain);
 490 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
 491                                      struct device *dev);
 492 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 493 static void domain_context_clear(struct intel_iommu *iommu,
 494                                  struct device *dev);
 495 static int domain_detach_iommu(struct dmar_domain *domain,
 496                                struct intel_iommu *iommu);
 497
 498 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 499 int dmar_disabled = 0;
 500 #else
 501 int dmar_disabled = 1;
 502 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 503
 504 int intel_iommu_enabled = 0;
 505 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 506
 507 static int dmar_map_gfx = 1;
 508 static int dmar_forcedac;
 509 static int intel_iommu_strict;
 510 static int intel_iommu_superpage = 1;
 511 static int intel_iommu_ecs = 1;
 512 static int intel_iommu_pasid28;
 513 static int iommu_identity_mapping;
 514
 515 #define IDENTMAP_ALL            1
 516 #define IDENTMAP_GFX            2
 517 #define IDENTMAP_AZALIA         4
 518
 519 /* Broadwell and Skylake have broken ECS support — normal so-called "second
 520  * level" translation of DMA requests-without-PASID doesn't actually happen
 521  * unless you also set the NESTE bit in an extended context-entry. Which of
 522  * course means that SVM doesn't work because it's trying to do nested
 523  * translation of the physical addresses it finds in the process page tables,
 524  * through the IOVA->phys mapping found in the "second level" page tables.
 525  *
 526  * The VT-d specification was retroactively changed to change the definition
 527  * of the capability bits and pretend that Broadwell/Skylake never happened...
 528  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 529  * for some reason it was the PASID capability bit which was redefined (from
 530  * bit 28 on BDW/SKL to bit 40 in future).
 531  *
 532  * So our test for ECS needs to eschew those implementations which set the old
 533  * PASID capability bit 28, since those are the ones on which ECS is broken.
 534  * Unless we are working around the 'pasid28' limitations, that is, by putting
 535  * the device into passthrough mode for normal DMA and thus masking the bug.
 536  */
 537 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
 538                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
 539 /* PASID support is thus enabled if ECS is enabled and *either* of the old
 540  * or new capability bits are set. */
 541 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
 542                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
 543
 544 int intel_iommu_gfx_mapped;
 545 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 546
 547 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 548 static DEFINE_SPINLOCK(device_domain_lock);
 549 static LIST_HEAD(device_domain_list);
 550
 551 static const struct iommu_ops intel_iommu_ops;
 552
 553 static bool translation_pre_enabled(struct intel_iommu *iommu)
 554 {
 555         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 556 }
 557
 558 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 559 {
 560         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 561 }
 562
 563 static void init_translation_status(struct intel_iommu *iommu)
 564 {
 565         u32 gsts;
 566
 567         gsts = readl(iommu->reg + DMAR_GSTS_REG);
 568         if (gsts & DMA_GSTS_TES)
 569                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 570 }
 571
 572 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
 573 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 574 {
 575         return container_of(dom, struct dmar_domain, domain);
 576 }
 577
 578 static int __init intel_iommu_setup(char *str)
 579 {
 580         if (!str)
 581                 return -EINVAL;
 582         while (*str) {
 583                 if (!strncmp(str, "on", 2)) {
 584                         dmar_disabled = 0;
 585                         pr_info("IOMMU enabled\n");
 586                 } else if (!strncmp(str, "off", 3)) {
 587                         dmar_disabled = 1;
 588                         pr_info("IOMMU disabled\n");
 589                 } else if (!strncmp(str, "igfx_off", 8)) {
 590                         dmar_map_gfx = 0;
 591                         pr_info("Disable GFX device mapping\n");
 592                 } else if (!strncmp(str, "forcedac", 8)) {
 593                         pr_info("Forcing DAC for PCI devices\n");
 594                         dmar_forcedac = 1;
 595                 } else if (!strncmp(str, "strict", 6)) {
 596                         pr_info("Disable batched IOTLB flush\n");
 597                         intel_iommu_strict = 1;
 598                 } else if (!strncmp(str, "sp_off", 6)) {
 599                         pr_info("Disable supported super page\n");
 600                         intel_iommu_superpage = 0;
 601                 } else if (!strncmp(str, "ecs_off", 7)) {
 602                         printk(KERN_INFO
 603                                 "Intel-IOMMU: disable extended context table support\n");
 604                         intel_iommu_ecs = 0;
 605                 } else if (!strncmp(str, "pasid28", 7)) {
 606                         printk(KERN_INFO
 607                                 "Intel-IOMMU: enable pre-production PASID support\n");
 608                         intel_iommu_pasid28 = 1;
 609                         iommu_identity_mapping |= IDENTMAP_GFX;
 610                 }
 611
 612                 str += strcspn(str, ",");
 613                 while (*str == ',')
 614                         str++;
 615         }
 616         return 0;
 617 }
 618 __setup("intel_iommu=", intel_iommu_setup);
 619
 620 static struct kmem_cache *iommu_domain_cache;
 621 static struct kmem_cache *iommu_devinfo_cache;
 622
 623 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 624 {
 625         struct dmar_domain **domains;
 626         int idx = did >> 8;
 627
 628         domains = iommu->domains[idx];
 629         if (!domains)
 630                 return NULL;
 631
 632         return domains[did & 0xff];
 633 }
 634
 635 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 636                              struct dmar_domain *domain)
 637 {
 638         struct dmar_domain **domains;
 639         int idx = did >> 8;
 640
 641         if (!iommu->domains[idx]) {
 642                 size_t size = 256 * sizeof(struct dmar_domain *);
 643                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 644         }
 645
 646         domains = iommu->domains[idx];
 647         if (WARN_ON(!domains))
 648                 return;
 649         else
 650                 domains[did & 0xff] = domain;
 651 }
 652
 653 static inline void *alloc_pgtable_page(int node)
 654 {
 655         struct page *page;
 656         void *vaddr = NULL;
 657
 658         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 659         if (page)
 660                 vaddr = page_address(page);
 661         return vaddr;
 662 }
 663
 664 static inline void free_pgtable_page(void *vaddr)
 665 {
 666         free_page((unsigned long)vaddr);
 667 }
 668
 669 static inline void *alloc_domain_mem(void)
 670 {
 671         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 672 }
 673
 674 static void free_domain_mem(void *vaddr)
 675 {
 676         kmem_cache_free(iommu_domain_cache, vaddr);
 677 }
 678
 679 static inline void * alloc_devinfo_mem(void)
 680 {
 681         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 682 }
 683
 684 static inline void free_devinfo_mem(void *vaddr)
 685 {
 686         kmem_cache_free(iommu_devinfo_cache, vaddr);
 687 }
 688
 689 static inline int domain_type_is_vm(struct dmar_domain *domain)
 690 {
 691         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
 692 }
 693
 694 static inline int domain_type_is_si(struct dmar_domain *domain)
 695 {
 696         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 697 }
 698
 699 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
 700 {
 701         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
 702                                 DOMAIN_FLAG_STATIC_IDENTITY);
 703 }
 704
 705 static inline int domain_pfn_supported(struct dmar_domain *domain,
 706                                        unsigned long pfn)
 707 {
 708         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 709
 710         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 711 }
 712
 713 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 714 {
 715         unsigned long sagaw;
 716         int agaw = -1;
 717
 718         sagaw = cap_sagaw(iommu->cap);
 719         for (agaw = width_to_agaw(max_gaw);
 720              agaw >= 0; agaw--) {
 721                 if (test_bit(agaw, &sagaw))
 722                         break;
 723         }
 724
 725         return agaw;
 726 }
 727
 728 /*
 729  * Calculate max SAGAW for each iommu.
 730  */
 731 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 732 {
 733         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 734 }
 735
 736 /*
 737  * calculate agaw for each iommu.
 738  * "SAGAW" may be different across iommus, use a default agaw, and
 739  * get a supported less agaw for iommus that don't support the default agaw.
 740  */
 741 int iommu_calculate_agaw(struct intel_iommu *iommu)
 742 {
 743         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 744 }
 745
 746 /* This functionin only returns single iommu in a domain */
 747 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 748 {
 749         int iommu_id;
 750
 751         /* si_domain and vm domain should not get here. */
 752         BUG_ON(domain_type_is_vm_or_si(domain));
 753         for_each_domain_iommu(iommu_id, domain)
 754                 break;
 755
 756         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 757                 return NULL;
 758
 759         return g_iommus[iommu_id];
 760 }
 761
 762 static void domain_update_iommu_coherency(struct dmar_domain *domain)
 763 {
 764         struct dmar_drhd_unit *drhd;
 765         struct intel_iommu *iommu;
 766         bool found = false;
 767         int i;
 768
 769         domain->iommu_coherency = 1;
 770
 771         for_each_domain_iommu(i, domain) {
 772                 found = true;
 773                 if (!ecap_coherent(g_iommus[i]->ecap)) {
 774                         domain->iommu_coherency = 0;
 775                         break;
 776                 }
 777         }
 778         if (found)
 779                 return;
 780
 781         /* No hardware attached; use lowest common denominator */
 782         rcu_read_lock();
 783         for_each_active_iommu(iommu, drhd) {
 784                 if (!ecap_coherent(iommu->ecap)) {
 785                         domain->iommu_coherency = 0;
 786                         break;
 787                 }
 788         }
 789         rcu_read_unlock();
 790 }
 791
 792 static int domain_update_iommu_snooping(struct intel_iommu *skip)
 793 {
 794         struct dmar_drhd_unit *drhd;
 795         struct intel_iommu *iommu;
 796         int ret = 1;
 797
 798         rcu_read_lock();
 799         for_each_active_iommu(iommu, drhd) {
 800                 if (iommu != skip) {
 801                         if (!ecap_sc_support(iommu->ecap)) {
 802                                 ret = 0;
 803                                 break;
 804                         }
 805                 }
 806         }
 807         rcu_read_unlock();
 808
 809         return ret;
 810 }
 811
 812 static int domain_update_iommu_superpage(struct intel_iommu *skip)
 813 {
 814         struct dmar_drhd_unit *drhd;
 815         struct intel_iommu *iommu;
 816         int mask = 0xf;
 817
 818         if (!intel_iommu_superpage) {
 819                 return 0;
 820         }
 821
 822         /* set iommu_superpage to the smallest common denominator */
 823         rcu_read_lock();
 824         for_each_active_iommu(iommu, drhd) {
 825                 if (iommu != skip) {
 826                         mask &= cap_super_page_val(iommu->cap);
 827                         if (!mask)
 828                                 break;
 829                 }
 830         }
 831         rcu_read_unlock();
 832
 833         return fls(mask);
 834 }
 835
 836 /* Some capabilities may be different across iommus */
 837 static void domain_update_iommu_cap(struct dmar_domain *domain)
 838 {
 839         domain_update_iommu_coherency(domain);
 840         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 841         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
 842 }
 843
 844 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
 845                                                        u8 bus, u8 devfn, int alloc)
 846 {
 847         struct root_entry *root = &iommu->root_entry[bus];
 848         struct context_entry *context;
 849         u64 *entry;
 850
 851         entry = &root->lo;
 852         if (ecs_enabled(iommu)) {
 853                 if (devfn >= 0x80) {
 854                         devfn -= 0x80;
 855                         entry = &root->hi;
 856                 }
 857                 devfn *= 2;
 858         }
 859         if (*entry & 1)
 860                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
 861         else {
 862                 unsigned long phy_addr;
 863                 if (!alloc)
 864                         return NULL;
 865
 866                 context = alloc_pgtable_page(iommu->node);
 867                 if (!context)
 868                         return NULL;
 869
 870                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 871                 phy_addr = virt_to_phys((void *)context);
 872                 *entry = phy_addr | 1;
 873                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
 874         }
 875         return &context[devfn];
 876 }
 877
 878 static int iommu_dummy(struct device *dev)
 879 {
 880         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
 881 }
 882
 883 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 884 {
 885         struct dmar_drhd_unit *drhd = NULL;
 886         struct intel_iommu *iommu;
 887         struct device *tmp;
 888         struct pci_dev *ptmp, *pdev = NULL;
 889         u16 segment = 0;
 890         int i;
 891
 892         if (iommu_dummy(dev))
 893                 return NULL;
 894
 895         if (dev_is_pci(dev)) {
 896                 struct pci_dev *pf_pdev;
 897
 898                 pdev = to_pci_dev(dev);
 899                 /* VFs aren't listed in scope tables; we need to look up
 900                  * the PF instead to find the IOMMU. */
 901                 pf_pdev = pci_physfn(pdev);
 902                 dev = &pf_pdev->dev;
 903                 segment = pci_domain_nr(pdev->bus);
 904         } else if (has_acpi_companion(dev))
 905                 dev = &ACPI_COMPANION(dev)->dev;
 906
 907         rcu_read_lock();
 908         for_each_active_iommu(iommu, drhd) {
 909                 if (pdev && segment != drhd->segment)
 910                         continue;
 911
 912                 for_each_active_dev_scope(drhd->devices,
 913                                           drhd->devices_cnt, i, tmp) {
 914                         if (tmp == dev) {
 915                                 /* For a VF use its original BDF# not that of the PF
 916                                  * which we used for the IOMMU lookup. Strictly speaking
 917                                  * we could do this for all PCI devices; we only need to
 918                                  * get the BDF# from the scope table for ACPI matches. */
 919                                 if (pdev->is_virtfn)
 920                                         goto got_pdev;
 921
 922                                 *bus = drhd->devices[i].bus;
 923                                 *devfn = drhd->devices[i].devfn;
 924                                 goto out;
 925                         }
 926
 927                         if (!pdev || !dev_is_pci(tmp))
 928                                 continue;
 929
 930                         ptmp = to_pci_dev(tmp);
 931                         if (ptmp->subordinate &&
 932                             ptmp->subordinate->number <= pdev->bus->number &&
 933                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
 934                                 goto got_pdev;
 935                 }
 936
 937                 if (pdev && drhd->include_all) {
 938                 got_pdev:
 939                         *bus = pdev->bus->number;
 940                         *devfn = pdev->devfn;
 941                         goto out;
 942                 }
 943         }
 944         iommu = NULL;
 945  out:
 946         rcu_read_unlock();
 947
 948         return iommu;
 949 }
 950
 951 static void domain_flush_cache(struct dmar_domain *domain,
 952                                void *addr, int size)
 953 {
 954         if (!domain->iommu_coherency)
 955                 clflush_cache_range(addr, size);
 956 }
 957
 958 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 959 {
 960         struct context_entry *context;
 961         int ret = 0;
 962         unsigned long flags;
 963
 964         spin_lock_irqsave(&iommu->lock, flags);
 965         context = iommu_context_addr(iommu, bus, devfn, 0);
 966         if (context)
 967                 ret = context_present(context);
 968         spin_unlock_irqrestore(&iommu->lock, flags);
 969         return ret;
 970 }
 971
 972 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 973 {
 974         struct context_entry *context;
 975         unsigned long flags;
 976
 977         spin_lock_irqsave(&iommu->lock, flags);
 978         context = iommu_context_addr(iommu, bus, devfn, 0);
 979         if (context) {
 980                 context_clear_entry(context);
 981                 __iommu_flush_cache(iommu, context, sizeof(*context));
 982         }
 983         spin_unlock_irqrestore(&iommu->lock, flags);
 984 }
 985
 986 static void free_context_table(struct intel_iommu *iommu)
 987 {
 988         int i;
 989         unsigned long flags;
 990         struct context_entry *context;
 991
 992         spin_lock_irqsave(&iommu->lock, flags);
 993         if (!iommu->root_entry) {
 994                 goto out;
 995         }
 996         for (i = 0; i < ROOT_ENTRY_NR; i++) {
 997                 context = iommu_context_addr(iommu, i, 0, 0);
 998                 if (context)
 999                         free_pgtable_page(context);
1000
1001                 if (!ecs_enabled(iommu))
1002                         continue;
1003
1004                 context = iommu_context_addr(iommu, i, 0x80, 0);
1005                 if (context)
1006                         free_pgtable_page(context);
1007
1008         }
1009         free_pgtable_page(iommu->root_entry);
1010         iommu->root_entry = NULL;
1011 out:
1012         spin_unlock_irqrestore(&iommu->lock, flags);
1013 }
1014
1015 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1016                                       unsigned long pfn, int *target_level)
1017 {
1018         struct dma_pte *parent, *pte = NULL;
1019         int level = agaw_to_level(domain->agaw);
1020         int offset;
1021
1022         BUG_ON(!domain->pgd);
1023
1024         if (!domain_pfn_supported(domain, pfn))
1025                 /* Address beyond IOMMU's addressing capabilities. */
1026                 return NULL;
1027
1028         parent = domain->pgd;
1029
1030         while (1) {
1031                 void *tmp_page;
1032
1033                 offset = pfn_level_offset(pfn, level);
1034                 pte = &parent[offset];
1035                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1036                         break;
1037                 if (level == *target_level)
1038                         break;
1039
1040                 if (!dma_pte_present(pte)) {
1041                         uint64_t pteval;
1042
1043                         tmp_page = alloc_pgtable_page(domain->nid);
1044
1045                         if (!tmp_page)
1046                                 return NULL;
1047
1048                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1049                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1050                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1051                                 /* Someone else set it while we were thinking; use theirs. */
1052                                 free_pgtable_page(tmp_page);
1053                         else
1054                                 domain_flush_cache(domain, pte, sizeof(*pte));
1055                 }
1056                 if (level == 1)
1057                         break;
1058
1059                 parent = phys_to_virt(dma_pte_addr(pte));
1060                 level--;
1061         }
1062
1063         if (!*target_level)
1064                 *target_level = level;
1065
1066         return pte;
1067 }
1068
1069
1070 /* return address's pte at specific level */
1071 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1072                                          unsigned long pfn,
1073                                          int level, int *large_page)
1074 {
1075         struct dma_pte *parent, *pte = NULL;
1076         int total = agaw_to_level(domain->agaw);
1077         int offset;
1078
1079         parent = domain->pgd;
1080         while (level <= total) {
1081                 offset = pfn_level_offset(pfn, total);
1082                 pte = &parent[offset];
1083                 if (level == total)
1084                         return pte;
1085
1086                 if (!dma_pte_present(pte)) {
1087                         *large_page = total;
1088                         break;
1089                 }
1090
1091                 if (dma_pte_superpage(pte)) {
1092                         *large_page = total;
1093                         return pte;
1094                 }
1095
1096                 parent = phys_to_virt(dma_pte_addr(pte));
1097                 total--;
1098         }
1099         return NULL;
1100 }
1101
1102 /* clear last level pte, a tlb flush should be followed */
1103 static void dma_pte_clear_range(struct dmar_domain *domain,
1104                                 unsigned long start_pfn,
1105                                 unsigned long last_pfn)
1106 {
1107         unsigned int large_page = 1;
1108         struct dma_pte *first_pte, *pte;
1109
1110         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1111         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1112         BUG_ON(start_pfn > last_pfn);
1113
1114         /* we don't need lock here; nobody else touches the iova range */
1115         do {
1116                 large_page = 1;
1117                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1118                 if (!pte) {
1119                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1120                         continue;
1121                 }
1122                 do {
1123                         dma_clear_pte(pte);
1124                         start_pfn += lvl_to_nr_pages(large_page);
1125                         pte++;
1126                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1127
1128                 domain_flush_cache(domain, first_pte,
1129                                    (void *)pte - (void *)first_pte);
1130
1131         } while (start_pfn && start_pfn <= last_pfn);
1132 }
1133
1134 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1135                                struct dma_pte *pte, unsigned long pfn,
1136                                unsigned long start_pfn, unsigned long last_pfn)
1137 {
1138         pfn = max(start_pfn, pfn);
1139         pte = &pte[pfn_level_offset(pfn, level)];
1140
1141         do {
1142                 unsigned long level_pfn;
1143                 struct dma_pte *level_pte;
1144
1145                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1146                         goto next;
1147
1148                 level_pfn = pfn & level_mask(level - 1);
1149                 level_pte = phys_to_virt(dma_pte_addr(pte));
1150
1151                 if (level > 2)
1152                         dma_pte_free_level(domain, level - 1, level_pte,
1153                                            level_pfn, start_pfn, last_pfn);
1154
1155                 /* If range covers entire pagetable, free it */
1156                 if (!(start_pfn > level_pfn ||
1157                       last_pfn < level_pfn + level_size(level) - 1)) {
1158                         dma_clear_pte(pte);
1159                         domain_flush_cache(domain, pte, sizeof(*pte));
1160                         free_pgtable_page(level_pte);
1161                 }
1162 next:
1163                 pfn += level_size(level);
1164         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1165 }
1166
1167 /* clear last level (leaf) ptes and free page table pages. */
1168 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1169                                    unsigned long start_pfn,
1170                                    unsigned long last_pfn)
1171 {
1172         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1173         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1174         BUG_ON(start_pfn > last_pfn);
1175
1176         dma_pte_clear_range(domain, start_pfn, last_pfn);
1177
1178         /* We don't need lock here; nobody else touches the iova range */
1179         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1180                            domain->pgd, 0, start_pfn, last_pfn);
1181
1182         /* free pgd */
1183         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1184                 free_pgtable_page(domain->pgd);
1185                 domain->pgd = NULL;
1186         }
1187 }
1188
1189 /* When a page at a given level is being unlinked from its parent, we don't
1190    need to *modify* it at all. All we need to do is make a list of all the
1191    pages which can be freed just as soon as we've flushed the IOTLB and we
1192    know the hardware page-walk will no longer touch them.
1193    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1194    be freed. */
1195 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1196                                             int level, struct dma_pte *pte,
1197                                             struct page *freelist)
1198 {
1199         struct page *pg;
1200
1201         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1202         pg->freelist = freelist;
1203         freelist = pg;
1204
1205         if (level == 1)
1206                 return freelist;
1207
1208         pte = page_address(pg);
1209         do {
1210                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1211                         freelist = dma_pte_list_pagetables(domain, level - 1,
1212                                                            pte, freelist);
1213                 pte++;
1214         } while (!first_pte_in_page(pte));
1215
1216         return freelist;
1217 }
1218
1219 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1220                                         struct dma_pte *pte, unsigned long pfn,
1221                                         unsigned long start_pfn,
1222                                         unsigned long last_pfn,
1223                                         struct page *freelist)
1224 {
1225         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1226
1227         pfn = max(start_pfn, pfn);
1228         pte = &pte[pfn_level_offset(pfn, level)];
1229
1230         do {
1231                 unsigned long level_pfn;
1232
1233                 if (!dma_pte_present(pte))
1234                         goto next;
1235
1236                 level_pfn = pfn & level_mask(level);
1237
1238                 /* If range covers entire pagetable, free it */
1239                 if (start_pfn <= level_pfn &&
1240                     last_pfn >= level_pfn + level_size(level) - 1) {
1241                         /* These suborbinate page tables are going away entirely. Don't
1242                            bother to clear them; we're just going to *free* them. */
1243                         if (level > 1 && !dma_pte_superpage(pte))
1244                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1245
1246                         dma_clear_pte(pte);
1247                         if (!first_pte)
1248                                 first_pte = pte;
1249                         last_pte = pte;
1250                 } else if (level > 1) {
1251                         /* Recurse down into a level that isn't *entirely* obsolete */
1252                         freelist = dma_pte_clear_level(domain, level - 1,
1253                                                        phys_to_virt(dma_pte_addr(pte)),
1254                                                        level_pfn, start_pfn, last_pfn,
1255                                                        freelist);
1256                 }
1257 next:
1258                 pfn += level_size(level);
1259         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1260
1261         if (first_pte)
1262                 domain_flush_cache(domain, first_pte,
1263                                    (void *)++last_pte - (void *)first_pte);
1264
1265         return freelist;
1266 }
1267
1268 /* We can't just free the pages because the IOMMU may still be walking
1269    the page tables, and may have cached the intermediate levels. The
1270    pages can only be freed after the IOTLB flush has been done. */
1271 static struct page *domain_unmap(struct dmar_domain *domain,
1272                                  unsigned long start_pfn,
1273                                  unsigned long last_pfn)
1274 {
1275         struct page *freelist = NULL;
1276
1277         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1278         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1279         BUG_ON(start_pfn > last_pfn);
1280
1281         /* we don't need lock here; nobody else touches the iova range */
1282         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1283                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1284
1285         /* free pgd */
1286         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1287                 struct page *pgd_page = virt_to_page(domain->pgd);
1288                 pgd_page->freelist = freelist;
1289                 freelist = pgd_page;
1290
1291                 domain->pgd = NULL;
1292         }
1293
1294         return freelist;
1295 }
1296
1297 static void dma_free_pagelist(struct page *freelist)
1298 {
1299         struct page *pg;
1300
1301         while ((pg = freelist)) {
1302                 freelist = pg->freelist;
1303                 free_pgtable_page(page_address(pg));
1304         }
1305 }
1306
1307 /* iommu handling */
1308 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1309 {
1310         struct root_entry *root;
1311         unsigned long flags;
1312
1313         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1314         if (!root) {
1315                 pr_err("Allocating root entry for %s failed\n",
1316                         iommu->name);
1317                 return -ENOMEM;
1318         }
1319
1320         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1321
1322         spin_lock_irqsave(&iommu->lock, flags);
1323         iommu->root_entry = root;
1324         spin_unlock_irqrestore(&iommu->lock, flags);
1325
1326         return 0;
1327 }
1328
1329 static void iommu_set_root_entry(struct intel_iommu *iommu)
1330 {
1331         u64 addr;
1332         u32 sts;
1333         unsigned long flag;
1334
1335         addr = virt_to_phys(iommu->root_entry);
1336         if (ecs_enabled(iommu))
1337                 addr |= DMA_RTADDR_RTT;
1338
1339         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1341
1342         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1343
1344         /* Make sure hardware complete it */
1345         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1346                       readl, (sts & DMA_GSTS_RTPS), sts);
1347
1348         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1349 }
1350
1351 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1352 {
1353         u32 val;
1354         unsigned long flag;
1355
1356         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1357                 return;
1358
1359         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1360         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1361
1362         /* Make sure hardware complete it */
1363         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1364                       readl, (!(val & DMA_GSTS_WBFS)), val);
1365
1366         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1367 }
1368
1369 /* return value determine if we need a write buffer flush */
1370 static void __iommu_flush_context(struct intel_iommu *iommu,
1371                                   u16 did, u16 source_id, u8 function_mask,
1372                                   u64 type)
1373 {
1374         u64 val = 0;
1375         unsigned long flag;
1376
1377         switch (type) {
1378         case DMA_CCMD_GLOBAL_INVL:
1379                 val = DMA_CCMD_GLOBAL_INVL;
1380                 break;
1381         case DMA_CCMD_DOMAIN_INVL:
1382                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1383                 break;
1384         case DMA_CCMD_DEVICE_INVL:
1385                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1386                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1387                 break;
1388         default:
1389                 BUG();
1390         }
1391         val |= DMA_CCMD_ICC;
1392
1393         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1394         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1395
1396         /* Make sure hardware complete it */
1397         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1398                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1399
1400         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1401 }
1402
1403 /* return value determine if we need a write buffer flush */
1404 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1405                                 u64 addr, unsigned int size_order, u64 type)
1406 {
1407         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1408         u64 val = 0, val_iva = 0;
1409         unsigned long flag;
1410
1411         switch (type) {
1412         case DMA_TLB_GLOBAL_FLUSH:
1413                 /* global flush doesn't need set IVA_REG */
1414                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1415                 break;
1416         case DMA_TLB_DSI_FLUSH:
1417                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1418                 break;
1419         case DMA_TLB_PSI_FLUSH:
1420                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1421                 /* IH bit is passed in as part of address */
1422                 val_iva = size_order | addr;
1423                 break;
1424         default:
1425                 BUG();
1426         }
1427         /* Note: set drain read/write */
1428 #if 0
1429         /*
1430          * This is probably to be super secure.. Looks like we can
1431          * ignore it without any impact.
1432          */
1433         if (cap_read_drain(iommu->cap))
1434                 val |= DMA_TLB_READ_DRAIN;
1435 #endif
1436         if (cap_write_drain(iommu->cap))
1437                 val |= DMA_TLB_WRITE_DRAIN;
1438
1439         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1440         /* Note: Only uses first TLB reg currently */
1441         if (val_iva)
1442                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1443         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1444
1445         /* Make sure hardware complete it */
1446         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1447                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1448
1449         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1450
1451         /* check IOTLB invalidation granularity */
1452         if (DMA_TLB_IAIG(val) == 0)
1453                 pr_err("Flush IOTLB failed\n");
1454         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1455                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1456                         (unsigned long long)DMA_TLB_IIRG(type),
1457                         (unsigned long long)DMA_TLB_IAIG(val));
1458 }
1459
1460 static struct device_domain_info *
1461 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1462                          u8 bus, u8 devfn)
1463 {
1464         struct device_domain_info *info;
1465
1466         assert_spin_locked(&device_domain_lock);
1467
1468         if (!iommu->qi)
1469                 return NULL;
1470
1471         list_for_each_entry(info, &domain->devices, link)
1472                 if (info->iommu == iommu && info->bus == bus &&
1473                     info->devfn == devfn) {
1474                         if (info->ats_supported && info->dev)
1475                                 return info;
1476                         break;
1477                 }
1478
1479         return NULL;
1480 }
1481
1482 static void domain_update_iotlb(struct dmar_domain *domain)
1483 {
1484         struct device_domain_info *info;
1485         bool has_iotlb_device = false;
1486
1487         assert_spin_locked(&device_domain_lock);
1488
1489         list_for_each_entry(info, &domain->devices, link) {
1490                 struct pci_dev *pdev;
1491
1492                 if (!info->dev || !dev_is_pci(info->dev))
1493                         continue;
1494
1495                 pdev = to_pci_dev(info->dev);
1496                 if (pdev->ats_enabled) {
1497                         has_iotlb_device = true;
1498                         break;
1499                 }
1500         }
1501
1502         domain->has_iotlb_device = has_iotlb_device;
1503 }
1504
1505 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1506 {
1507         struct pci_dev *pdev;
1508
1509         assert_spin_locked(&device_domain_lock);
1510
1511         if (!info || !dev_is_pci(info->dev))
1512                 return;
1513
1514         pdev = to_pci_dev(info->dev);
1515
1516 #ifdef CONFIG_INTEL_IOMMU_SVM
1517         /* The PCIe spec, in its wisdom, declares that the behaviour of
1518            the device if you enable PASID support after ATS support is
1519            undefined. So always enable PASID support on devices which
1520            have it, even if we can't yet know if we're ever going to
1521            use it. */
1522         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1523                 info->pasid_enabled = 1;
1524
1525         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1526                 info->pri_enabled = 1;
1527 #endif
1528         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1529                 info->ats_enabled = 1;
1530                 domain_update_iotlb(info->domain);
1531                 info->ats_qdep = pci_ats_queue_depth(pdev);
1532         }
1533 }
1534
1535 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1536 {
1537         struct pci_dev *pdev;
1538
1539         assert_spin_locked(&device_domain_lock);
1540
1541         if (!dev_is_pci(info->dev))
1542                 return;
1543
1544         pdev = to_pci_dev(info->dev);
1545
1546         if (info->ats_enabled) {
1547                 pci_disable_ats(pdev);
1548                 info->ats_enabled = 0;
1549                 domain_update_iotlb(info->domain);
1550         }
1551 #ifdef CONFIG_INTEL_IOMMU_SVM
1552         if (info->pri_enabled) {
1553                 pci_disable_pri(pdev);
1554                 info->pri_enabled = 0;
1555         }
1556         if (info->pasid_enabled) {
1557                 pci_disable_pasid(pdev);
1558                 info->pasid_enabled = 0;
1559         }
1560 #endif
1561 }
1562
1563 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1564                                   u64 addr, unsigned mask)
1565 {
1566         u16 sid, qdep;
1567         unsigned long flags;
1568         struct device_domain_info *info;
1569
1570         if (!domain->has_iotlb_device)
1571                 return;
1572
1573         spin_lock_irqsave(&device_domain_lock, flags);
1574         list_for_each_entry(info, &domain->devices, link) {
1575                 if (!info->ats_enabled)
1576                         continue;
1577
1578                 sid = info->bus << 8 | info->devfn;
1579                 qdep = info->ats_qdep;
1580                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1581         }
1582         spin_unlock_irqrestore(&device_domain_lock, flags);
1583 }
1584
1585 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1586                                   struct dmar_domain *domain,
1587                                   unsigned long pfn, unsigned int pages,
1588                                   int ih, int map)
1589 {
1590         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1591         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1592         u16 did = domain->iommu_did[iommu->seq_id];
1593
1594         BUG_ON(pages == 0);
1595
1596         if (ih)
1597                 ih = 1 << 6;
1598         /*
1599          * Fallback to domain selective flush if no PSI support or the size is
1600          * too big.
1601          * PSI requires page size to be 2 ^ x, and the base address is naturally
1602          * aligned to the size
1603          */
1604         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1605                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1606                                                 DMA_TLB_DSI_FLUSH);
1607         else
1608                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1609                                                 DMA_TLB_PSI_FLUSH);
1610
1611         /*
1612          * In caching mode, changes of pages from non-present to present require
1613          * flush. However, device IOTLB doesn't need to be flushed in this case.
1614          */
1615         if (!cap_caching_mode(iommu->cap) || !map)
1616                 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1617                                       addr, mask);
1618 }
1619
1620 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1621 {
1622         u32 pmen;
1623         unsigned long flags;
1624
1625         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1626         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1627         pmen &= ~DMA_PMEN_EPM;
1628         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1629
1630         /* wait for the protected region status bit to clear */
1631         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1632                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1633
1634         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1635 }
1636
1637 static void iommu_enable_translation(struct intel_iommu *iommu)
1638 {
1639         u32 sts;
1640         unsigned long flags;
1641
1642         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1643         iommu->gcmd |= DMA_GCMD_TE;
1644         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1645
1646         /* Make sure hardware complete it */
1647         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1648                       readl, (sts & DMA_GSTS_TES), sts);
1649
1650         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1651 }
1652
1653 static void iommu_disable_translation(struct intel_iommu *iommu)
1654 {
1655         u32 sts;
1656         unsigned long flag;
1657
1658         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1659         iommu->gcmd &= ~DMA_GCMD_TE;
1660         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1661
1662         /* Make sure hardware complete it */
1663         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1664                       readl, (!(sts & DMA_GSTS_TES)), sts);
1665
1666         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1667 }
1668
1669
1670 static int iommu_init_domains(struct intel_iommu *iommu)
1671 {
1672         u32 ndomains, nlongs;
1673         size_t size;
1674
1675         ndomains = cap_ndoms(iommu->cap);
1676         pr_debug("%s: Number of Domains supported <%d>\n",
1677                  iommu->name, ndomains);
1678         nlongs = BITS_TO_LONGS(ndomains);
1679
1680         spin_lock_init(&iommu->lock);
1681
1682         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1683         if (!iommu->domain_ids) {
1684                 pr_err("%s: Allocating domain id array failed\n",
1685                        iommu->name);
1686                 return -ENOMEM;
1687         }
1688
1689         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1690         iommu->domains = kzalloc(size, GFP_KERNEL);
1691
1692         if (iommu->domains) {
1693                 size = 256 * sizeof(struct dmar_domain *);
1694                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1695         }
1696
1697         if (!iommu->domains || !iommu->domains[0]) {
1698                 pr_err("%s: Allocating domain array failed\n",
1699                        iommu->name);
1700                 kfree(iommu->domain_ids);
1701                 kfree(iommu->domains);
1702                 iommu->domain_ids = NULL;
1703                 iommu->domains    = NULL;
1704                 return -ENOMEM;
1705         }
1706
1707
1708
1709         /*
1710          * If Caching mode is set, then invalid translations are tagged
1711          * with domain-id 0, hence we need to pre-allocate it. We also
1712          * use domain-id 0 as a marker for non-allocated domain-id, so
1713          * make sure it is not used for a real domain.
1714          */
1715         set_bit(0, iommu->domain_ids);
1716
1717         return 0;
1718 }
1719
1720 static void disable_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722         struct device_domain_info *info, *tmp;
1723         unsigned long flags;
1724
1725         if (!iommu->domains || !iommu->domain_ids)
1726                 return;
1727
1728 again:
1729         spin_lock_irqsave(&device_domain_lock, flags);
1730         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1731                 struct dmar_domain *domain;
1732
1733                 if (info->iommu != iommu)
1734                         continue;
1735
1736                 if (!info->dev || !info->domain)
1737                         continue;
1738
1739                 domain = info->domain;
1740
1741                 __dmar_remove_one_dev_info(info);
1742
1743                 if (!domain_type_is_vm_or_si(domain)) {
1744                         /*
1745                          * The domain_exit() function  can't be called under
1746                          * device_domain_lock, as it takes this lock itself.
1747                          * So release the lock here and re-run the loop
1748                          * afterwards.
1749                          */
1750                         spin_unlock_irqrestore(&device_domain_lock, flags);
1751                         domain_exit(domain);
1752                         goto again;
1753                 }
1754         }
1755         spin_unlock_irqrestore(&device_domain_lock, flags);
1756
1757         if (iommu->gcmd & DMA_GCMD_TE)
1758                 iommu_disable_translation(iommu);
1759 }
1760
1761 static void free_dmar_iommu(struct intel_iommu *iommu)
1762 {
1763         if ((iommu->domains) && (iommu->domain_ids)) {
1764                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1765                 int i;
1766
1767                 for (i = 0; i < elems; i++)
1768                         kfree(iommu->domains[i]);
1769                 kfree(iommu->domains);
1770                 kfree(iommu->domain_ids);
1771                 iommu->domains = NULL;
1772                 iommu->domain_ids = NULL;
1773         }
1774
1775         g_iommus[iommu->seq_id] = NULL;
1776
1777         /* free context mapping */
1778         free_context_table(iommu);
1779
1780 #ifdef CONFIG_INTEL_IOMMU_SVM
1781         if (pasid_enabled(iommu)) {
1782                 if (ecap_prs(iommu->ecap))
1783                         intel_svm_finish_prq(iommu);
1784                 intel_svm_free_pasid_tables(iommu);
1785         }
1786 #endif
1787 }
1788
1789 static struct dmar_domain *alloc_domain(int flags)
1790 {
1791         struct dmar_domain *domain;
1792
1793         domain = alloc_domain_mem();
1794         if (!domain)
1795                 return NULL;
1796
1797         memset(domain, 0, sizeof(*domain));
1798         domain->nid = -1;
1799         domain->flags = flags;
1800         domain->has_iotlb_device = false;
1801         INIT_LIST_HEAD(&domain->devices);
1802
1803         return domain;
1804 }
1805
1806 /* Must be called with iommu->lock */
1807 static int domain_attach_iommu(struct dmar_domain *domain,
1808                                struct intel_iommu *iommu)
1809 {
1810         unsigned long ndomains;
1811         int num;
1812
1813         assert_spin_locked(&device_domain_lock);
1814         assert_spin_locked(&iommu->lock);
1815
1816         domain->iommu_refcnt[iommu->seq_id] += 1;
1817         domain->iommu_count += 1;
1818         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1819                 ndomains = cap_ndoms(iommu->cap);
1820                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1821
1822                 if (num >= ndomains) {
1823                         pr_err("%s: No free domain ids\n", iommu->name);
1824                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1825                         domain->iommu_count -= 1;
1826                         return -ENOSPC;
1827                 }
1828
1829                 set_bit(num, iommu->domain_ids);
1830                 set_iommu_domain(iommu, num, domain);
1831
1832                 domain->iommu_did[iommu->seq_id] = num;
1833                 domain->nid                      = iommu->node;
1834
1835                 domain_update_iommu_cap(domain);
1836         }
1837
1838         return 0;
1839 }
1840
1841 static int domain_detach_iommu(struct dmar_domain *domain,
1842                                struct intel_iommu *iommu)
1843 {
1844         int num, count = INT_MAX;
1845
1846         assert_spin_locked(&device_domain_lock);
1847         assert_spin_locked(&iommu->lock);
1848
1849         domain->iommu_refcnt[iommu->seq_id] -= 1;
1850         count = --domain->iommu_count;
1851         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1852                 num = domain->iommu_did[iommu->seq_id];
1853                 clear_bit(num, iommu->domain_ids);
1854                 set_iommu_domain(iommu, num, NULL);
1855
1856                 domain_update_iommu_cap(domain);
1857                 domain->iommu_did[iommu->seq_id] = 0;
1858         }
1859
1860         return count;
1861 }
1862
/*
 * IOVA ranges reserved by dmar_init_reserved_ranges(); copied into a
 * domain's own IOVA allocator by domain_reserve_special_ranges().
 */
static struct iova_domain reserved_iova_list;
/* Lockdep class assigned to reserved_iova_list.iova_rbtree_lock. */
static struct lock_class_key reserved_rbtree_key;
1865
1866 static int dmar_init_reserved_ranges(void)
1867 {
1868         struct pci_dev *pdev = NULL;
1869         struct iova *iova;
1870         int i;
1871
1872         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1873                         DMA_32BIT_PFN);
1874
1875         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1876                 &reserved_rbtree_key);
1877
1878         /* IOAPIC ranges shouldn't be accessed by DMA */
1879         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1880                 IOVA_PFN(IOAPIC_RANGE_END));
1881         if (!iova) {
1882                 pr_err("Reserve IOAPIC range failed\n");
1883                 return -ENODEV;
1884         }
1885
1886         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1887         for_each_pci_dev(pdev) {
1888                 struct resource *r;
1889
1890                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1891                         r = &pdev->resource[i];
1892                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1893                                 continue;
1894                         iova = reserve_iova(&reserved_iova_list,
1895                                             IOVA_PFN(r->start),
1896                                             IOVA_PFN(r->end));
1897                         if (!iova) {
1898                                 pr_err("Reserve iova failed\n");
1899                                 return -ENODEV;
1900                         }
1901                 }
1902         }
1903         return 0;
1904 }
1905
1906 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1907 {
1908         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1909 }
1910
/*
 * Round a guest address width up to the next value of the form 12 + 9 * n,
 * capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int width = gaw;

	/* Not on a 9-bit step above 12: bump up to the next step. */
	if (rem != 0)
		width = gaw + 9 - rem;

	return width > 64 ? 64 : width;
}
1924
1925 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1926                        int guest_width)
1927 {
1928         int adjust_width, agaw;
1929         unsigned long sagaw;
1930
1931         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1932                         DMA_32BIT_PFN);
1933         domain_reserve_special_ranges(domain);
1934
1935         /* calculate AGAW */
1936         if (guest_width > cap_mgaw(iommu->cap))
1937                 guest_width = cap_mgaw(iommu->cap);
1938         domain->gaw = guest_width;
1939         adjust_width = guestwidth_to_adjustwidth(guest_width);
1940         agaw = width_to_agaw(adjust_width);
1941         sagaw = cap_sagaw(iommu->cap);
1942         if (!test_bit(agaw, &sagaw)) {
1943                 /* hardware doesn't support it, choose a bigger one */
1944                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1945                 agaw = find_next_bit(&sagaw, 5, agaw);
1946                 if (agaw >= 5)
1947                         return -ENODEV;
1948         }
1949         domain->agaw = agaw;
1950
1951         if (ecap_coherent(iommu->ecap))
1952                 domain->iommu_coherency = 1;
1953         else
1954                 domain->iommu_coherency = 0;
1955
1956         if (ecap_sc_support(iommu->ecap))
1957                 domain->iommu_snooping = 1;
1958         else
1959                 domain->iommu_snooping = 0;
1960
1961         if (intel_iommu_superpage)
1962                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1963         else
1964                 domain->iommu_superpage = 0;
1965
1966         domain->nid = iommu->node;
1967
1968         /* always allocate the top pgd */
1969         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1970         if (!domain->pgd)
1971                 return -ENOMEM;
1972         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1973         return 0;
1974 }
1975
1976 static void domain_exit(struct dmar_domain *domain)
1977 {
1978         struct page *freelist = NULL;
1979
1980         /* Domain 0 is reserved, so dont process it */
1981         if (!domain)
1982                 return;
1983
1984         /* Flush any lazy unmaps that may reference this domain */
1985         if (!intel_iommu_strict) {
1986                 int cpu;
1987
1988                 for_each_possible_cpu(cpu)
1989                         flush_unmaps_timeout(cpu);
1990         }
1991
1992         /* Remove associated devices and clear attached or cached domains */
1993         rcu_read_lock();
1994         domain_remove_dev_info(domain);
1995         rcu_read_unlock();
1996
1997         /* destroy iovas */
1998         put_iova_domain(&domain->iovad);
1999
2000         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2001
2002         dma_free_pagelist(freelist);
2003
2004         free_domain_mem(domain);
2005 }
2006
2007 static int domain_context_mapping_one(struct dmar_domain *domain,
2008                                       struct intel_iommu *iommu,
2009                                       u8 bus, u8 devfn)
2010 {
2011         u16 did = domain->iommu_did[iommu->seq_id];
2012         int translation = CONTEXT_TT_MULTI_LEVEL;
2013         struct device_domain_info *info = NULL;
2014         struct context_entry *context;
2015         unsigned long flags;
2016         struct dma_pte *pgd;
2017         int ret, agaw;
2018
2019         WARN_ON(did == 0);
2020
2021         if (hw_pass_through && domain_type_is_si(domain))
2022                 translation = CONTEXT_TT_PASS_THROUGH;
2023
2024         pr_debug("Set context mapping for %02x:%02x.%d\n",
2025                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2026
2027         BUG_ON(!domain->pgd);
2028
2029         spin_lock_irqsave(&device_domain_lock, flags);
2030         spin_lock(&iommu->lock);
2031
2032         ret = -ENOMEM;
2033         context = iommu_context_addr(iommu, bus, devfn, 1);
2034         if (!context)
2035                 goto out_unlock;
2036
2037         ret = 0;
2038         if (context_present(context))
2039                 goto out_unlock;
2040
2041         /*
2042          * For kdump cases, old valid entries may be cached due to the
2043          * in-flight DMA and copied pgtable, but there is no unmapping
2044          * behaviour for them, thus we need an explicit cache flush for
2045          * the newly-mapped device. For kdump, at this point, the device
2046          * is supposed to finish reset at its driver probe stage, so no
2047          * in-flight DMA will exist, and we don't need to worry anymore
2048          * hereafter.
2049          */
2050         if (context_copied(context)) {
2051                 u16 did_old = context_domain_id(context);
2052
2053                 if (did_old >= 0 && did_old < cap_ndoms(iommu->cap))
2054                         iommu->flush.flush_context(iommu, did_old,
2055                                                    (((u16)bus) << 8) | devfn,
2056                                                    DMA_CCMD_MASK_NOBIT,
2057                                                    DMA_CCMD_DEVICE_INVL);
2058         }
2059
2060         pgd = domain->pgd;
2061
2062         context_clear_entry(context);
2063         context_set_domain_id(context, did);
2064
2065         /*
2066          * Skip top levels of page tables for iommu which has less agaw
2067          * than default.  Unnecessary for PT mode.
2068          */
2069         if (translation != CONTEXT_TT_PASS_THROUGH) {
2070                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2071                         ret = -ENOMEM;
2072                         pgd = phys_to_virt(dma_pte_addr(pgd));
2073                         if (!dma_pte_present(pgd))
2074                                 goto out_unlock;
2075                 }
2076
2077                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2078                 if (info && info->ats_supported)
2079                         translation = CONTEXT_TT_DEV_IOTLB;
2080                 else
2081                         translation = CONTEXT_TT_MULTI_LEVEL;
2082
2083                 context_set_address_root(context, virt_to_phys(pgd));
2084                 context_set_address_width(context, iommu->agaw);
2085         } else {
2086                 /*
2087                  * In pass through mode, AW must be programmed to
2088                  * indicate the largest AGAW value supported by
2089                  * hardware. And ASR is ignored by hardware.
2090                  */
2091                 context_set_address_width(context, iommu->msagaw);
2092         }
2093
2094         context_set_translation_type(context, translation);
2095         context_set_fault_enable(context);
2096         context_set_present(context);
2097         domain_flush_cache(domain, context, sizeof(*context));
2098
2099         /*
2100          * It's a non-present to present mapping. If hardware doesn't cache
2101          * non-present entry we only need to flush the write-buffer. If the
2102          * _does_ cache non-present entries, then it does so in the special
2103          * domain #0, which we have to flush:
2104          */
2105         if (cap_caching_mode(iommu->cap)) {
2106                 iommu->flush.flush_context(iommu, 0,
2107                                            (((u16)bus) << 8) | devfn,
2108                                            DMA_CCMD_MASK_NOBIT,
2109                                            DMA_CCMD_DEVICE_INVL);
2110                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2111         } else {
2112                 iommu_flush_write_buffer(iommu);
2113         }
2114         iommu_enable_dev_iotlb(info);
2115
2116         ret = 0;
2117
2118 out_unlock:
2119         spin_unlock(&iommu->lock);
2120         spin_unlock_irqrestore(&device_domain_lock, flags);
2121
2122         return ret;
2123 }
2124
/*
 * Argument bundle handed through pci_for_each_dma_alias() to
 * domain_context_mapping_cb().
 */
struct domain_context_mapping_data {
	struct dmar_domain *domain;	/* domain to map each alias into */
	struct intel_iommu *iommu;	/* IOMMU whose context table is programmed */
};
2129
2130 static int domain_context_mapping_cb(struct pci_dev *pdev,
2131                                      u16 alias, void *opaque)
2132 {
2133         struct domain_context_mapping_data *data = opaque;
2134
2135         return domain_context_mapping_one(data->domain, data->iommu,
2136                                           PCI_BUS_NUM(alias), alias & 0xff);
2137 }
2138
2139 static int
2140 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2141 {
2142         struct intel_iommu *iommu;
2143         u8 bus, devfn;
2144         struct domain_context_mapping_data data;
2145
2146         iommu = device_to_iommu(dev, &bus, &devfn);
2147         if (!iommu)
2148                 return -ENODEV;
2149
2150         if (!dev_is_pci(dev))
2151                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2152
2153         data.domain = domain;
2154         data.iommu = iommu;
2155
2156         return pci_for_each_dma_alias(to_pci_dev(dev),
2157                                       &domain_context_mapping_cb, &data);
2158 }
2159
2160 static int domain_context_mapped_cb(struct pci_dev *pdev,
2161                                     u16 alias, void *opaque)
2162 {
2163         struct intel_iommu *iommu = opaque;
2164
2165         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2166 }
2167
2168 static int domain_context_mapped(struct device *dev)
2169 {
2170         struct intel_iommu *iommu;
2171         u8 bus, devfn;
2172
2173         iommu = device_to_iommu(dev, &bus, &devfn);
2174         if (!iommu)
2175                 return -ENODEV;
2176
2177         if (!dev_is_pci(dev))
2178                 return device_context_mapped(iommu, bus, devfn);
2179
2180         return !pci_for_each_dma_alias(to_pci_dev(dev),
2181                                        domain_context_mapped_cb, iommu);
2182 }
2183
2184 /* Returns a number of VTD pages, but aligned to MM page size */
2185 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2186                                             size_t size)
2187 {
2188         host_addr &= ~PAGE_MASK;
2189         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2190 }
2191
2192 /* Return largest possible superpage level for a given mapping */
2193 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2194                                           unsigned long iov_pfn,
2195                                           unsigned long phy_pfn,
2196                                           unsigned long pages)
2197 {
2198         int support, level = 1;
2199         unsigned long pfnmerge;
2200
2201         support = domain->iommu_superpage;
2202
2203         /* To use a large page, the virtual *and* physical addresses
2204            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2205            of them will mean we have to use smaller pages. So just
2206            merge them and check both at once. */
2207         pfnmerge = iov_pfn | phy_pfn;
2208
2209         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2210                 pages >>= VTD_STRIDE_SHIFT;
2211                 if (!pages)
2212                         break;
2213                 pfnmerge >>= VTD_STRIDE_SHIFT;
2214                 level++;
2215                 support--;
2216         }
2217         return level;
2218 }
2219
/*
 * Install page-table entries in @domain for @nr_pages VT-d pages starting at
 * IOVA page frame @iov_pfn.
 *
 * When @sg is NULL the range is backed by physically contiguous memory
 * starting at @phys_pfn.  Otherwise the backing pages are taken from the
 * scatterlist, and each entry's dma_address and dma_length are filled in as
 * it is consumed.
 *
 * Only the READ, WRITE and SNP bits of @prot are used.
 *
 * Returns 0 on success, -EINVAL if @prot grants neither read nor write
 * access, or -ENOMEM if a page-table entry cannot be obtained.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	unsigned long sg_res = 0;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;

	/* The last pfn of the range must be addressable by this domain. */
	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	/* Contiguous case: behaves like one segment covering everything. */
	if (!sg) {
		sg_res = nr_pages;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		/* Current segment used up: start on the next sg entry. */
		if (!sg_res) {
			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
			sg->dma_length = sg->length;
			pteval = page_to_phys(sg_page(sg)) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		/* No current PTE: pick a page size and look the PTE up. */
		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is large page*/
			if (largepage_lvl > 1) {
				unsigned long nr_superpages, end_pfn;

				pteval |= DMA_PTE_LARGE_PAGE;
				lvl_pages = lvl_to_nr_pages(largepage_lvl);

				nr_superpages = sg_res / lvl_pages;
				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

				/*
				 * Ensure that old small page tables are
				 * removed to make room for superpage(s).
				 */
				dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* PTE was not empty: report it, dumping mappings at most 5 times. */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		/* Advance every cursor by the number of pages this PTE covers. */
		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
2329
/* Map @nr_pages pages at @iov_pfn, taking the backing memory from @sg. */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	int rc = __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);

	return rc;
}
2336
2337 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2338                                      unsigned long phys_pfn, unsigned long nr_pages,
2339                                      int prot)
2340 {
2341         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2342 }
2343
2344 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2345 {
2346         if (!iommu)
2347                 return;
2348
2349         clear_context_table(iommu, bus, devfn);
2350         iommu->flush.flush_context(iommu, 0, 0, 0,
2351                                            DMA_CCMD_GLOBAL_INVL);
2352         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2353 }
2354
2355 static inline void unlink_domain_info(struct device_domain_info *info)
2356 {
2357         assert_spin_locked(&device_domain_lock);
2358         list_del(&info->link);
2359         list_del(&info->global);
2360         if (info->dev)
2361                 info->dev->archdata.iommu = NULL;
2362 }
2363
2364 static void domain_remove_dev_info(struct dmar_domain *domain)
2365 {
2366         struct device_domain_info *info, *tmp;
2367         unsigned long flags;
2368
2369         spin_lock_irqsave(&device_domain_lock, flags);
2370         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2371                 __dmar_remove_one_dev_info(info);
2372         spin_unlock_irqrestore(&device_domain_lock, flags);
2373 }
2374
2375 /*
2376  * find_domain
2377  * Note: we use struct device->archdata.iommu stores the info
2378  */
2379 static struct dmar_domain *find_domain(struct device *dev)
2380 {
2381         struct device_domain_info *info;
2382
2383         /* No lock here, assumes no domain exit in normal case */
2384         info = dev->archdata.iommu;
2385         if (info)
2386                 return info->domain;
2387         return NULL;
2388 }
2389
2390 static inline struct device_domain_info *
2391 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2392 {
2393         struct device_domain_info *info;
2394
2395         list_for_each_entry(info, &device_domain_list, global)
2396                 if (info->iommu->segment == segment && info->bus == bus &&
2397                     info->devfn == devfn)
2398                         return info;
2399
2400         return NULL;
2401 }
2402
2403 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2404                                                     int bus, int devfn,
2405                                                     struct device *dev,
2406                                                     struct dmar_domain *domain)
2407 {
2408         struct dmar_domain *found = NULL;
2409         struct device_domain_info *info;
2410         unsigned long flags;
2411         int ret;
2412
2413         info = alloc_devinfo_mem();
2414         if (!info)
2415                 return NULL;
2416
2417         info->bus = bus;
2418         info->devfn = devfn;
2419         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2420         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2421         info->ats_qdep = 0;
2422         info->dev = dev;
2423         info->domain = domain;
2424         info->iommu = iommu;
2425
2426         if (dev && dev_is_pci(dev)) {
2427                 struct pci_dev *pdev = to_pci_dev(info->dev);
2428
2429                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2430                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2431                     dmar_find_matched_atsr_unit(pdev))
2432                         info->ats_supported = 1;
2433
2434                 if (ecs_enabled(iommu)) {
2435                         if (pasid_enabled(iommu)) {
2436                                 int features = pci_pasid_features(pdev);
2437                                 if (features >= 0)
2438                                         info->pasid_supported = features | 1;
2439                         }
2440
2441                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2442                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2443                                 info->pri_supported = 1;
2444                 }
2445         }
2446
2447         spin_lock_irqsave(&device_domain_lock, flags);
2448         if (dev)
2449                 found = find_domain(dev);
2450
2451         if (!found) {
2452                 struct device_domain_info *info2;
2453                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2454                 if (info2) {
2455                         found      = info2->domain;
2456                         info2->dev = dev;
2457                 }
2458         }
2459
2460         if (found) {
2461                 spin_unlock_irqrestore(&device_domain_lock, flags);
2462                 free_devinfo_mem(info);
2463                 /* Caller must free the original domain */
2464                 return found;
2465         }
2466
2467         spin_lock(&iommu->lock);
2468         ret = domain_attach_iommu(domain, iommu);
2469         spin_unlock(&iommu->lock);
2470
2471         if (ret) {
2472                 spin_unlock_irqrestore(&device_domain_lock, flags);
2473                 free_devinfo_mem(info);
2474                 return NULL;
2475         }
2476
2477         list_add(&info->link, &domain->devices);
2478         list_add(&info->global, &device_domain_list);
2479         if (dev)
2480                 dev->archdata.iommu = info;
2481         spin_unlock_irqrestore(&device_domain_lock, flags);
2482
2483         if (dev && domain_context_mapping(domain, dev)) {
2484                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2485                 dmar_remove_one_dev_info(domain, dev);
2486                 return NULL;
2487         }
2488
2489         return domain;
2490 }
2491
2492 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2493 {
2494         *(u16 *)opaque = alias;
2495         return 0;
2496 }
2497
2498 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2499 {
2500         struct device_domain_info *info = NULL;
2501         struct dmar_domain *domain = NULL;
2502         struct intel_iommu *iommu;
2503         u16 req_id, dma_alias;
2504         unsigned long flags;
2505         u8 bus, devfn;
2506
2507         iommu = device_to_iommu(dev, &bus, &devfn);
2508         if (!iommu)
2509                 return NULL;
2510
2511         req_id = ((u16)bus << 8) | devfn;
2512
2513         if (dev_is_pci(dev)) {
2514                 struct pci_dev *pdev = to_pci_dev(dev);
2515
2516                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2517
2518                 spin_lock_irqsave(&device_domain_lock, flags);
2519                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2520                                                       PCI_BUS_NUM(dma_alias),
2521                                                       dma_alias & 0xff);
2522                 if (info) {
2523                         iommu = info->iommu;
2524                         domain = info->domain;
2525                 }
2526                 spin_unlock_irqrestore(&device_domain_lock, flags);
2527
2528                 /* DMA alias already has a domain, use it */
2529                 if (info)
2530                         goto out;
2531         }
2532
2533         /* Allocate and initialize new domain for the device */
2534         domain = alloc_domain(0);
2535         if (!domain)
2536                 return NULL;
2537         if (domain_init(domain, iommu, gaw)) {
2538                 domain_exit(domain);
2539                 return NULL;
2540         }
2541
2542 out:
2543
2544         return domain;
2545 }
2546
2547 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2548                                               struct dmar_domain *domain)
2549 {
2550         struct intel_iommu *iommu;
2551         struct dmar_domain *tmp;
2552         u16 req_id, dma_alias;
2553         u8 bus, devfn;
2554
2555         iommu = device_to_iommu(dev, &bus, &devfn);
2556         if (!iommu)
2557                 return NULL;
2558
2559         req_id = ((u16)bus << 8) | devfn;
2560
2561         if (dev_is_pci(dev)) {
2562                 struct pci_dev *pdev = to_pci_dev(dev);
2563
2564                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2565
2566                 /* register PCI DMA alias device */
2567                 if (req_id != dma_alias) {
2568                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2569                                         dma_alias & 0xff, NULL, domain);
2570
2571                         if (!tmp || tmp != domain)
2572                                 return tmp;
2573                 }
2574         }
2575
2576         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2577         if (!tmp || tmp != domain)
2578                 return tmp;
2579
2580         return domain;
2581 }
2582
2583 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2584 {
2585         struct dmar_domain *domain, *tmp;
2586
2587         domain = find_domain(dev);
2588         if (domain)
2589                 goto out;
2590
2591         domain = find_or_alloc_domain(dev, gaw);
2592         if (!domain)
2593                 goto out;
2594
2595         tmp = set_domain_for_dev(dev, domain);
2596         if (!tmp || domain != tmp) {
2597                 domain_exit(domain);
2598                 domain = tmp;
2599         }
2600
2601 out:
2602
2603         return domain;
2604 }
2605
2606 static int iommu_domain_identity_map(struct dmar_domain *domain,
2607                                      unsigned long long start,
2608                                      unsigned long long end)
2609 {
2610         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2611         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2612
2613         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2614                           dma_to_mm_pfn(last_vpfn))) {
2615                 pr_err("Reserving iova failed\n");
2616                 return -ENOMEM;
2617         }
2618
2619         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2620         /*
2621          * RMRR range might have overlap with physical memory range,
2622          * clear it first
2623          */
2624         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2625
2626         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2627                                   last_vpfn - first_vpfn + 1,
2628                                   DMA_PTE_READ|DMA_PTE_WRITE);
2629 }
2630
2631 static int domain_prepare_identity_map(struct device *dev,
2632                                        struct dmar_domain *domain,
2633                                        unsigned long long start,
2634                                        unsigned long long end)
2635 {
2636         /* For _hardware_ passthrough, don't bother. But for software
2637            passthrough, we do it anyway -- it may indicate a memory
2638            range which is reserved in E820, so which didn't get set
2639            up to start with in si_domain */
2640         if (domain == si_domain && hw_pass_through) {
2641                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2642                         dev_name(dev), start, end);
2643                 return 0;
2644         }
2645
2646         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2647                 dev_name(dev), start, end);
2648
2649         if (end < start) {
2650                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2651                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2652                         dmi_get_system_info(DMI_BIOS_VENDOR),
2653                         dmi_get_system_info(DMI_BIOS_VERSION),
2654                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2655                 return -EIO;
2656         }
2657
2658         if (end >> agaw_to_width(domain->agaw)) {
2659                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2660                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2661                      agaw_to_width(domain->agaw),
2662                      dmi_get_system_info(DMI_BIOS_VENDOR),
2663                      dmi_get_system_info(DMI_BIOS_VERSION),
2664                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2665                 return -EIO;
2666         }
2667
2668         return iommu_domain_identity_map(domain, start, end);
2669 }
2670
2671 static int iommu_prepare_identity_map(struct device *dev,
2672                                       unsigned long long start,
2673                                       unsigned long long end)
2674 {
2675         struct dmar_domain *domain;
2676         int ret;
2677
2678         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2679         if (!domain)
2680                 return -ENOMEM;
2681
2682         ret = domain_prepare_identity_map(dev, domain, start, end);
2683         if (ret)
2684                 domain_exit(domain);
2685
2686         return ret;
2687 }
2688
2689 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2690                                          struct device *dev)
2691 {
2692         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2693                 return 0;
2694         return iommu_prepare_identity_map(dev, rmrr->base_address,
2695                                           rmrr->end_address);
2696 }
2697
2698 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2699 static inline void iommu_prepare_isa(void)
2700 {
2701         struct pci_dev *pdev;
2702         int ret;
2703
2704         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2705         if (!pdev)
2706                 return;
2707
2708         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2709         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2710
2711         if (ret)
2712                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2713
2714         pci_dev_put(pdev);
2715 }
2716 #else
/* Floppy workaround not configured: nothing to prepare. */
static inline void iommu_prepare_isa(void)
{
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2722
2723 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2724
2725 static int __init si_domain_init(int hw)
2726 {
2727         int nid, ret = 0;
2728
2729         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2730         if (!si_domain)
2731                 return -EFAULT;
2732
2733         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2734                 domain_exit(si_domain);
2735                 return -EFAULT;
2736         }
2737
2738         pr_debug("Identity mapping domain allocated\n");
2739
2740         if (hw)
2741                 return 0;
2742
2743         for_each_online_node(nid) {
2744                 unsigned long start_pfn, end_pfn;
2745                 int i;
2746
2747                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2748                         ret = iommu_domain_identity_map(si_domain,
2749                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2750                         if (ret)
2751                                 return ret;
2752                 }
2753         }
2754
2755         return 0;
2756 }
2757
2758 static int identity_mapping(struct device *dev)
2759 {
2760         struct device_domain_info *info;
2761
2762         if (likely(!iommu_identity_mapping))
2763                 return 0;
2764
2765         info = dev->archdata.iommu;
2766         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2767                 return (info->domain == si_domain);
2768
2769         return 0;
2770 }
2771
2772 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2773 {
2774         struct dmar_domain *ndomain;
2775         struct intel_iommu *iommu;
2776         u8 bus, devfn;
2777
2778         iommu = device_to_iommu(dev, &bus, &devfn);
2779         if (!iommu)
2780                 return -ENODEV;
2781
2782         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2783         if (ndomain != domain)
2784                 return -EBUSY;
2785
2786         return 0;
2787 }
2788
2789 static bool device_has_rmrr(struct device *dev)
2790 {
2791         struct dmar_rmrr_unit *rmrr;
2792         struct device *tmp;
2793         int i;
2794
2795         rcu_read_lock();
2796         for_each_rmrr_units(rmrr) {
2797                 /*
2798                  * Return TRUE if this RMRR contains the device that
2799                  * is passed in.
2800                  */
2801                 for_each_active_dev_scope(rmrr->devices,
2802                                           rmrr->devices_cnt, i, tmp)
2803                         if (tmp == dev) {
2804                                 rcu_read_unlock();
2805                                 return true;
2806                         }
2807         }
2808         rcu_read_unlock();
2809         return false;
2810 }
2811
2812 /*
2813  * There are a couple cases where we need to restrict the functionality of
2814  * devices associated with RMRRs.  The first is when evaluating a device for
2815  * identity mapping because problems exist when devices are moved in and out
2816  * of domains and their respective RMRR information is lost.  This means that
2817  * a device with associated RMRRs will never be in a "passthrough" domain.
2818  * The second is use of the device through the IOMMU API.  This interface
2819  * expects to have full control of the IOVA space for the device.  We cannot
2820  * satisfy both the requirement that RMRR access is maintained and have an
2821  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2822  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2823  * We therefore prevent devices associated with an RMRR from participating in
2824  * the IOMMU API, which eliminates them from device assignment.
2825  *
2826  * In both cases we assume that PCI USB devices with RMRRs have them largely
2827  * for historical reasons and that the RMRR space is not actively used post
2828  * boot.  This exclusion may change if vendors begin to abuse it.
2829  *
2830  * The same exception is made for graphics devices, with the requirement that
2831  * any use of the RMRR regions will be torn down before assigning the device
2832  * to a guest.
2833  */
2834 static bool device_is_rmrr_locked(struct device *dev)
2835 {
2836         if (!device_has_rmrr(dev))
2837                 return false;
2838
2839         if (dev_is_pci(dev)) {
2840                 struct pci_dev *pdev = to_pci_dev(dev);
2841
2842                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2843                         return false;
2844         }
2845
2846         return true;
2847 }
2848
2849 static int iommu_should_identity_map(struct device *dev, int startup)
2850 {
2851
2852         if (dev_is_pci(dev)) {
2853                 struct pci_dev *pdev = to_pci_dev(dev);
2854
2855                 if (device_is_rmrr_locked(dev))
2856                         return 0;
2857
2858                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2859                         return 1;
2860
2861                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2862                         return 1;
2863
2864                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2865                         return 0;
2866
2867                 /*
2868                  * We want to start off with all devices in the 1:1 domain, and
2869                  * take them out later if we find they can't access all of memory.
2870                  *
2871                  * However, we can't do this for PCI devices behind bridges,
2872                  * because all PCI devices behind the same bridge will end up
2873                  * with the same source-id on their transactions.
2874                  *
2875                  * Practically speaking, we can't change things around for these
2876                  * devices at run-time, because we can't be sure there'll be no
2877                  * DMA transactions in flight for any of their siblings.
2878                  *
2879                  * So PCI devices (unless they're on the root bus) as well as
2880                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2881                  * the 1:1 domain, just in _case_ one of their siblings turns out
2882                  * not to be able to map all of memory.
2883                  */
2884                 if (!pci_is_pcie(pdev)) {
2885                         if (!pci_is_root_bus(pdev->bus))
2886                                 return 0;
2887                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2888                                 return 0;
2889                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2890                         return 0;
2891         } else {
2892                 if (device_has_rmrr(dev))
2893                         return 0;
2894         }
2895
2896         /*
2897          * At boot time, we don't yet know if devices will be 64-bit capable.
2898          * Assume that they will — if they turn out not to be, then we can
2899          * take them out of the 1:1 domain later.
2900          */
2901         if (!startup) {
2902                 /*
2903                  * If the device's dma_mask is less than the system's memory
2904                  * size then this is not a candidate for identity mapping.
2905                  */
2906                 u64 dma_mask = *dev->dma_mask;
2907
2908                 if (dev->coherent_dma_mask &&
2909                     dev->coherent_dma_mask < dma_mask)
2910                         dma_mask = dev->coherent_dma_mask;
2911
2912                 return dma_mask >= dma_get_required_mask(dev);
2913         }
2914
2915         return 1;
2916 }
2917
2918 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2919 {
2920         int ret;
2921
2922         if (!iommu_should_identity_map(dev, 1))
2923                 return 0;
2924
2925         ret = domain_add_dev_info(si_domain, dev);
2926         if (!ret)
2927                 pr_info("%s identity mapping for device %s\n",
2928                         hw ? "Hardware" : "Software", dev_name(dev));
2929         else if (ret == -ENODEV)
2930                 /* device not associated with an iommu */
2931                 ret = 0;
2932
2933         return ret;
2934 }
2935
2936
2937 static int __init iommu_prepare_static_identity_mapping(int hw)
2938 {
2939         struct pci_dev *pdev = NULL;
2940         struct dmar_drhd_unit *drhd;
2941         struct intel_iommu *iommu;
2942         struct device *dev;
2943         int i;
2944         int ret = 0;
2945
2946         for_each_pci_dev(pdev) {
2947                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2948                 if (ret)
2949                         return ret;
2950         }
2951
2952         for_each_active_iommu(iommu, drhd)
2953                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2954                         struct acpi_device_physical_node *pn;
2955                         struct acpi_device *adev;
2956
2957                         if (dev->bus != &acpi_bus_type)
2958                                 continue;
2959
2960                         adev= to_acpi_device(dev);
2961                         mutex_lock(&adev->physical_node_lock);
2962                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2963                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2964                                 if (ret)
2965                                         break;
2966                         }
2967                         mutex_unlock(&adev->physical_node_lock);
2968                         if (ret)
2969                                 return ret;
2970                 }
2971
2972         return 0;
2973 }
2974
2975 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2976 {
2977         /*
2978          * Start from the sane iommu hardware state.
2979          * If the queued invalidation is already initialized by us
2980          * (for example, while enabling interrupt-remapping) then
2981          * we got the things already rolling from a sane state.
2982          */
2983         if (!iommu->qi) {
2984                 /*
2985                  * Clear any previous faults.
2986                  */
2987                 dmar_fault(-1, iommu);
2988                 /*
2989                  * Disable queued invalidation if supported and already enabled
2990                  * before OS handover.
2991                  */
2992                 dmar_disable_qi(iommu);
2993         }
2994
2995         if (dmar_enable_qi(iommu)) {
2996                 /*
2997                  * Queued Invalidate not enabled, use Register Based Invalidate
2998                  */
2999                 iommu->flush.flush_context = __iommu_flush_context;
3000                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3001                 pr_info("%s: Using Register based invalidation\n",
3002                         iommu->name);
3003         } else {
3004                 iommu->flush.flush_context = qi_flush_context;
3005                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3006                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3007         }
3008 }
3009
3010 static int copy_context_table(struct intel_iommu *iommu,
3011                               struct root_entry *old_re,
3012                               struct context_entry **tbl,
3013                               int bus, bool ext)
3014 {
3015         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3016         struct context_entry *new_ce = NULL, ce;
3017         struct context_entry *old_ce = NULL;
3018         struct root_entry re;
3019         phys_addr_t old_ce_phys;
3020
3021         tbl_idx = ext ? bus * 2 : bus;
3022         memcpy(&re, old_re, sizeof(re));
3023
3024         for (devfn = 0; devfn < 256; devfn++) {
3025                 /* First calculate the correct index */
3026                 idx = (ext ? devfn * 2 : devfn) % 256;
3027
3028                 if (idx == 0) {
3029                         /* First save what we may have and clean up */
3030                         if (new_ce) {
3031                                 tbl[tbl_idx] = new_ce;
3032                                 __iommu_flush_cache(iommu, new_ce,
3033                                                     VTD_PAGE_SIZE);
3034                                 pos = 1;
3035                         }
3036
3037                         if (old_ce)
3038                                 iounmap(old_ce);
3039
3040                         ret = 0;
3041                         if (devfn < 0x80)
3042                                 old_ce_phys = root_entry_lctp(&re);
3043                         else
3044                                 old_ce_phys = root_entry_uctp(&re);
3045
3046                         if (!old_ce_phys) {
3047                                 if (ext && devfn == 0) {
3048                                         /* No LCTP, try UCTP */
3049                                         devfn = 0x7f;
3050                                         continue;
3051                                 } else {
3052                                         goto out;
3053                                 }
3054                         }
3055
3056                         ret = -ENOMEM;
3057                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3058                                         MEMREMAP_WB);
3059                         if (!old_ce)
3060                                 goto out;
3061
3062                         new_ce = alloc_pgtable_page(iommu->node);
3063                         if (!new_ce)
3064                                 goto out_unmap;
3065
3066                         ret = 0;
3067                 }
3068
3069                 /* Now copy the context entry */
3070                 memcpy(&ce, old_ce + idx, sizeof(ce));
3071
3072                 if (!__context_present(&ce))
3073                         continue;
3074
3075                 did = context_domain_id(&ce);
3076                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3077                         set_bit(did, iommu->domain_ids);
3078
3079                 /*
3080                  * We need a marker for copied context entries. This
3081                  * marker needs to work for the old format as well as
3082                  * for extended context entries.
3083                  *
3084                  * Bit 67 of the context entry is used. In the old
3085                  * format this bit is available to software, in the
3086                  * extended format it is the PGE bit, but PGE is ignored
3087                  * by HW if PASIDs are disabled (and thus still
3088                  * available).
3089                  *
3090                  * So disable PASIDs first and then mark the entry
3091                  * copied. This means that we don't copy PASID
3092                  * translations from the old kernel, but this is fine as
3093                  * faults there are not fatal.
3094                  */
3095                 context_clear_pasid_enable(&ce);
3096                 context_set_copied(&ce);
3097
3098                 new_ce[idx] = ce;
3099         }
3100
3101         tbl[tbl_idx + pos] = new_ce;
3102
3103         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3104
3105 out_unmap:
3106         memunmap(old_ce);
3107
3108 out:
3109         return ret;
3110 }
3111
3112 static int copy_translation_tables(struct intel_iommu *iommu)
3113 {
3114         struct context_entry **ctxt_tbls;
3115         struct root_entry *old_rt;
3116         phys_addr_t old_rt_phys;
3117         int ctxt_table_entries;
3118         unsigned long flags;
3119         u64 rtaddr_reg;
3120         int bus, ret;
3121         bool new_ext, ext;
3122
3123         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3124         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3125         new_ext    = !!ecap_ecs(iommu->ecap);
3126
3127         /*
3128          * The RTT bit can only be changed when translation is disabled,
3129          * but disabling translation means to open a window for data
3130          * corruption. So bail out and don't copy anything if we would
3131          * have to change the bit.
3132          */
3133         if (new_ext != ext)
3134                 return -EINVAL;
3135
3136         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3137         if (!old_rt_phys)
3138                 return -EINVAL;
3139
3140         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3141         if (!old_rt)
3142                 return -ENOMEM;
3143
3144         /* This is too big for the stack - allocate it from slab */
3145         ctxt_table_entries = ext ? 512 : 256;
3146         ret = -ENOMEM;
3147         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3148         if (!ctxt_tbls)
3149                 goto out_unmap;
3150
3151         for (bus = 0; bus < 256; bus++) {
3152                 ret = copy_context_table(iommu, &old_rt[bus],
3153                                          ctxt_tbls, bus, ext);
3154                 if (ret) {
3155                         pr_err("%s: Failed to copy context table for bus %d\n",
3156                                 iommu->name, bus);
3157                         continue;
3158                 }
3159         }
3160
3161         spin_lock_irqsave(&iommu->lock, flags);
3162
3163         /* Context tables are copied, now write them to the root_entry table */
3164         for (bus = 0; bus < 256; bus++) {
3165                 int idx = ext ? bus * 2 : bus;
3166                 u64 val;
3167
3168                 if (ctxt_tbls[idx]) {
3169                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3170                         iommu->root_entry[bus].lo = val;
3171                 }
3172
3173                 if (!ext || !ctxt_tbls[idx + 1])
3174                         continue;
3175
3176                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3177                 iommu->root_entry[bus].hi = val;
3178         }
3179
3180         spin_unlock_irqrestore(&iommu->lock, flags);
3181
3182         kfree(ctxt_tbls);
3183
3184         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3185
3186         ret = 0;
3187
3188 out_unmap:
3189         memunmap(old_rt);
3190
3191         return ret;
3192 }
3193
3194 static int __init init_dmars(void)
3195 {
3196         struct dmar_drhd_unit *drhd;
3197         struct dmar_rmrr_unit *rmrr;
3198         bool copied_tables = false;
3199         struct device *dev;
3200         struct intel_iommu *iommu;
3201         int i, ret, cpu;
3202
3203         /*
3204          * for each drhd
3205          *    allocate root
3206          *    initialize and program root entry to not present
3207          * endfor
3208          */
3209         for_each_drhd_unit(drhd) {
3210                 /*
3211                  * lock not needed as this is only incremented in the single
3212                  * threaded kernel __init code path all other access are read
3213                  * only
3214                  */
3215                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3216                         g_num_of_iommus++;
3217                         continue;
3218                 }
3219                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3220         }
3221
3222         /* Preallocate enough resources for IOMMU hot-addition */
3223         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3224                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3225
3226         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3227                         GFP_KERNEL);
3228         if (!g_iommus) {
3229                 pr_err("Allocating global iommu array failed\n");
3230                 ret = -ENOMEM;
3231                 goto error;
3232         }
3233
3234         for_each_possible_cpu(cpu) {
3235                 struct deferred_flush_data *dfd = per_cpu_ptr(&deferred_flush,
3236                                                               cpu);
3237
3238                 dfd->tables = kzalloc(g_num_of_iommus *
3239                                       sizeof(struct deferred_flush_table),
3240                                       GFP_KERNEL);
3241                 if (!dfd->tables) {
3242                         ret = -ENOMEM;
3243                         goto free_g_iommus;
3244                 }
3245
3246                 spin_lock_init(&dfd->lock);
3247                 setup_timer(&dfd->timer, flush_unmaps_timeout, cpu);
3248         }
3249
3250         for_each_active_iommu(iommu, drhd) {
3251                 g_iommus[iommu->seq_id] = iommu;
3252
3253                 intel_iommu_init_qi(iommu);
3254
3255                 ret = iommu_init_domains(iommu);
3256                 if (ret)
3257                         goto free_iommu;
3258
3259                 init_translation_status(iommu);
3260
3261                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3262                         iommu_disable_translation(iommu);
3263                         clear_translation_pre_enabled(iommu);
3264                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3265                                 iommu->name);
3266                 }
3267
3268                 /*
3269                  * TBD:
3270                  * we could share the same root & context tables
3271                  * among all IOMMU's. Need to Split it later.
3272                  */
3273                 ret = iommu_alloc_root_entry(iommu);
3274                 if (ret)
3275                         goto free_iommu;
3276
3277                 if (translation_pre_enabled(iommu)) {
3278                         pr_info("Translation already enabled - trying to copy translation structures\n");
3279
3280                         ret = copy_translation_tables(iommu);
3281                         if (ret) {
3282                                 /*
3283                                  * We found the IOMMU with translation
3284                                  * enabled - but failed to copy over the
3285                                  * old root-entry table. Try to proceed
3286                                  * by disabling translation now and
3287                                  * allocating a clean root-entry table.
3288                                  * This might cause DMAR faults, but
3289                                  * probably the dump will still succeed.
3290                                  */
3291                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3292                                        iommu->name);
3293                                 iommu_disable_translation(iommu);
3294                                 clear_translation_pre_enabled(iommu);
3295                         } else {
3296                                 pr_info("Copied translation tables from previous kernel for %s\n",
3297                                         iommu->name);
3298                                 copied_tables = true;
3299                         }
3300                 }
3301
3302                 if (!ecap_pass_through(iommu->ecap))
3303                         hw_pass_through = 0;
3304 #ifdef CONFIG_INTEL_IOMMU_SVM
3305                 if (pasid_enabled(iommu))
3306                         intel_svm_alloc_pasid_tables(iommu);
3307 #endif
3308         }
3309
3310         /*
3311          * Now that qi is enabled on all iommus, set the root entry and flush
3312          * caches. This is required on some Intel X58 chipsets, otherwise the
3313          * flush_context function will loop forever and the boot hangs.
3314          */
3315         for_each_active_iommu(iommu, drhd) {
3316                 iommu_flush_write_buffer(iommu);
3317                 iommu_set_root_entry(iommu);
3318                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3319                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3320         }
3321
3322         if (iommu_pass_through)
3323                 iommu_identity_mapping |= IDENTMAP_ALL;
3324
3325 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3326         iommu_identity_mapping |= IDENTMAP_GFX;
3327 #endif
3328
3329         if (iommu_identity_mapping) {
3330                 ret = si_domain_init(hw_pass_through);
3331                 if (ret)
3332                         goto free_iommu;
3333         }
3334
3335         check_tylersburg_isoch();
3336
3337         /*
3338          * If we copied translations from a previous kernel in the kdump
3339          * case, we can not assign the devices to domains now, as that
3340          * would eliminate the old mappings. So skip this part and defer
3341          * the assignment to device driver initialization time.
3342          */
3343         if (copied_tables)
3344                 goto domains_done;
3345
3346         /*
3347          * If pass through is not set or not enabled, setup context entries for
3348          * identity mappings for rmrr, gfx, and isa and may fall back to static
3349          * identity mapping if iommu_identity_mapping is set.
3350          */
3351         if (iommu_identity_mapping) {
3352                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3353                 if (ret) {
3354                         pr_crit("Failed to setup IOMMU pass-through\n");
3355                         goto free_iommu;
3356                 }
3357         }
3358         /*
3359          * For each rmrr
3360          *   for each dev attached to rmrr
3361          *   do
3362          *     locate drhd for dev, alloc domain for dev
3363          *     allocate free domain
3364          *     allocate page table entries for rmrr
3365          *     if context not allocated for bus
3366          *           allocate and init context
3367          *           set present in root table for this bus
3368          *     init context with domain, translation etc
3369          *    endfor
3370          * endfor
3371          */
3372         pr_info("Setting RMRR:\n");
3373         for_each_rmrr_units(rmrr) {
3374                 /* some BIOS lists non-exist devices in DMAR table. */
3375                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3376                                           i, dev) {
3377                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3378                         if (ret)
3379                                 pr_err("Mapping reserved region failed\n");
3380                 }
3381         }
3382
3383         iommu_prepare_isa();
3384
3385 domains_done:
3386
3387         /*
3388          * for each drhd
3389          *   enable fault log
3390          *   global invalidate context cache
3391          *   global invalidate iotlb
3392          *   enable translation
3393          */
3394         for_each_iommu(iommu, drhd) {
3395                 if (drhd->ignored) {
3396                         /*
3397                          * we always have to disable PMRs or DMA may fail on
3398                          * this device
3399                          */
3400                         if (force_on)
3401                                 iommu_disable_protect_mem_regions(iommu);
3402                         continue;
3403                 }
3404
3405                 iommu_flush_write_buffer(iommu);
3406
3407 #ifdef CONFIG_INTEL_IOMMU_SVM
3408                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3409                         ret = intel_svm_enable_prq(iommu);
3410                         if (ret)
3411                                 goto free_iommu;
3412                 }
3413 #endif
3414                 ret = dmar_set_interrupt(iommu);
3415                 if (ret)
3416                         goto free_iommu;
3417
3418                 if (!translation_pre_enabled(iommu))
3419                         iommu_enable_translation(iommu);
3420
3421                 iommu_disable_protect_mem_regions(iommu);
3422         }
3423
3424         return 0;
3425
3426 free_iommu:
3427         for_each_active_iommu(iommu, drhd) {
3428                 disable_dmar_iommu(iommu);
3429                 free_dmar_iommu(iommu);
3430         }
3431 free_g_iommus:
3432         for_each_possible_cpu(cpu)
3433                 kfree(per_cpu_ptr(&deferred_flush, cpu)->tables);
3434         kfree(g_iommus);
3435 error:
3436         return ret;
3437 }
3438
3439 /* This takes a number of _MM_ pages, not VTD pages */
3440 static unsigned long intel_alloc_iova(struct device *dev,
3441                                      struct dmar_domain *domain,
3442                                      unsigned long nrpages, uint64_t dma_mask)
3443 {
3444         unsigned long iova_pfn = 0;
3445
3446         /* Restrict dma_mask to the width that the iommu can handle */
3447         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3448         /* Ensure we reserve the whole size-aligned region */
3449         nrpages = __roundup_pow_of_two(nrpages);
3450
3451         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3452                 /*
3453                  * First try to allocate an io virtual address in
3454                  * DMA_BIT_MASK(32) and if that fails then try allocating
3455                  * from higher range
3456                  */
3457                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3458                                            IOVA_PFN(DMA_BIT_MASK(32)));
3459                 if (iova_pfn)
3460                         return iova_pfn;
3461         }
3462         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask));
3463         if (unlikely(!iova_pfn)) {
3464                 pr_err("Allocating %ld-page iova for %s failed",
3465                        nrpages, dev_name(dev));
3466                 return 0;
3467         }
3468
3469         return iova_pfn;
3470 }
3471
3472 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3473 {
3474         struct dmar_domain *domain, *tmp;
3475         struct dmar_rmrr_unit *rmrr;
3476         struct device *i_dev;
3477         int i, ret;
3478
3479         domain = find_domain(dev);
3480         if (domain)
3481                 goto out;
3482
3483         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3484         if (!domain)
3485                 goto out;
3486
3487         /* We have a new domain - setup possible RMRRs for the device */
3488         rcu_read_lock();
3489         for_each_rmrr_units(rmrr) {
3490                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3491                                           i, i_dev) {
3492                         if (i_dev != dev)
3493                                 continue;
3494
3495                         ret = domain_prepare_identity_map(dev, domain,
3496                                                           rmrr->base_address,
3497                                                           rmrr->end_address);
3498                         if (ret)
3499                                 dev_err(dev, "Mapping reserved region failed\n");
3500                 }
3501         }
3502         rcu_read_unlock();
3503
3504         tmp = set_domain_for_dev(dev, domain);
3505         if (!tmp || domain != tmp) {
3506                 domain_exit(domain);
3507                 domain = tmp;
3508         }
3509
3510 out:
3511
3512         if (!domain)
3513                 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3514
3515
3516         return domain;
3517 }
3518
3519 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3520 {
3521         struct device_domain_info *info;
3522
3523         /* No lock here, assumes no domain exit in normal case */
3524         info = dev->archdata.iommu;
3525         if (likely(info))
3526                 return info->domain;
3527
3528         return __get_valid_domain_for_dev(dev);
3529 }
3530
3531 /* Check if the dev needs to go through non-identity map and unmap process.*/
3532 static int iommu_no_mapping(struct device *dev)
3533 {
3534         int found;
3535
3536         if (iommu_dummy(dev))
3537                 return 1;
3538
3539         if (!iommu_identity_mapping)
3540                 return 0;
3541
3542         found = identity_mapping(dev);
3543         if (found) {
3544                 if (iommu_should_identity_map(dev, 0))
3545                         return 1;
3546                 else {
3547                         /*
3548                          * 32 bit DMA is removed from si_domain and fall back
3549                          * to non-identity mapping.
3550                          */
3551                         dmar_remove_one_dev_info(si_domain, dev);
3552                         pr_info("32bit %s uses non-identity mapping\n",
3553                                 dev_name(dev));
3554                         return 0;
3555                 }
3556         } else {
3557                 /*
3558                  * In case of a detached 64 bit DMA device from vm, the device
3559                  * is put into si_domain for identity mapping.
3560                  */
3561                 if (iommu_should_identity_map(dev, 0)) {
3562                         int ret;
3563                         ret = domain_add_dev_info(si_domain, dev);
3564                         if (!ret) {
3565                                 pr_info("64bit %s uses identity mapping\n",
3566                                         dev_name(dev));
3567                                 return 1;
3568                         }
3569                 }
3570         }
3571
3572         return 0;
3573 }
3574
3575 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3576                                      size_t size, int dir, u64 dma_mask)
3577 {
3578         struct dmar_domain *domain;
3579         phys_addr_t start_paddr;
3580         unsigned long iova_pfn;
3581         int prot = 0;
3582         int ret;
3583         struct intel_iommu *iommu;
3584         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3585
3586         BUG_ON(dir == DMA_NONE);
3587
3588         if (iommu_no_mapping(dev))
3589                 return paddr;
3590
3591         domain = get_valid_domain_for_dev(dev);
3592         if (!domain)
3593                 return 0;
3594
3595         iommu = domain_get_iommu(domain);
3596         size = aligned_nrpages(paddr, size);
3597
3598         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3599         if (!iova_pfn)
3600                 goto error;
3601
3602         /*
3603          * Check if DMAR supports zero-length reads on write only
3604          * mappings..
3605          */
3606         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3607                         !cap_zlr(iommu->cap))
3608                 prot |= DMA_PTE_READ;
3609         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3610                 prot |= DMA_PTE_WRITE;
3611         /*
3612          * paddr - (paddr + size) might be partial page, we should map the whole
3613          * page.  Note: if two part of one page are separately mapped, we
3614          * might have two guest_addr mapping to the same host paddr, but this
3615          * is not a big problem
3616          */
3617         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3618                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3619         if (ret)
3620                 goto error;
3621
3622         /* it's a non-present to present mapping. Only flush if caching mode */
3623         if (cap_caching_mode(iommu->cap))
3624                 iommu_flush_iotlb_psi(iommu, domain,
3625                                       mm_to_dma_pfn(iova_pfn),
3626                                       size, 0, 1);
3627         else
3628                 iommu_flush_write_buffer(iommu);
3629
3630         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3631         start_paddr += paddr & ~PAGE_MASK;
3632         return start_paddr;
3633
3634 error:
3635         if (iova_pfn)
3636                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3637         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3638                 dev_name(dev), size, (unsigned long long)paddr, dir);
3639         return 0;
3640 }
3641
3642 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3643                                  unsigned long offset, size_t size,
3644                                  enum dma_data_direction dir,
3645                                  unsigned long attrs)
3646 {
3647         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3648                                   dir, *dev->dma_mask);
3649 }
3650
3651 static void flush_unmaps(struct deferred_flush_data *flush_data)
3652 {
3653         int i, j;
3654
3655         flush_data->timer_on = 0;
3656
3657         /* just flush them all */
3658         for (i = 0; i < g_num_of_iommus; i++) {
3659                 struct intel_iommu *iommu = g_iommus[i];
3660                 struct deferred_flush_table *flush_table =
3661                                 &flush_data->tables[i];
3662                 if (!iommu)
3663                         continue;
3664
3665                 if (!flush_table->next)
3666                         continue;
3667
3668                 /* In caching mode, global flushes turn emulation expensive */
3669                 if (!cap_caching_mode(iommu->cap))
3670                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3671                                          DMA_TLB_GLOBAL_FLUSH);
3672                 for (j = 0; j < flush_table->next; j++) {
3673                         unsigned long mask;
3674                         struct deferred_flush_entry *entry =
3675                                                 &flush_table->entries[j];
3676                         unsigned long iova_pfn = entry->iova_pfn;
3677                         unsigned long nrpages = entry->nrpages;
3678                         struct dmar_domain *domain = entry->domain;
3679                         struct page *freelist = entry->freelist;
3680
3681                         /* On real hardware multiple invalidations are expensive */
3682                         if (cap_caching_mode(iommu->cap))
3683                                 iommu_flush_iotlb_psi(iommu, domain,
3684                                         mm_to_dma_pfn(iova_pfn),
3685                                         nrpages, !freelist, 0);
3686                         else {
3687                                 mask = ilog2(nrpages);
3688                                 iommu_flush_dev_iotlb(domain,
3689                                                 (uint64_t)iova_pfn << PAGE_SHIFT, mask);
3690                         }
3691                         free_iova_fast(&domain->iovad, iova_pfn, nrpages);
3692                         if (freelist)
3693                                 dma_free_pagelist(freelist);
3694                 }
3695                 flush_table->next = 0;
3696         }
3697
3698         flush_data->size = 0;
3699 }
3700
3701 static void flush_unmaps_timeout(unsigned long cpuid)
3702 {
3703         struct deferred_flush_data *flush_data = per_cpu_ptr(&deferred_flush, cpuid);
3704         unsigned long flags;
3705
3706         spin_lock_irqsave(&flush_data->lock, flags);
3707         flush_unmaps(flush_data);
3708         spin_unlock_irqrestore(&flush_data->lock, flags);
3709 }
3710
3711 static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
3712                       unsigned long nrpages, struct page *freelist)
3713 {
3714         unsigned long flags;
3715         int entry_id, iommu_id;
3716         struct intel_iommu *iommu;
3717         struct deferred_flush_entry *entry;
3718         struct deferred_flush_data *flush_data;
3719         unsigned int cpuid;
3720
3721         cpuid = get_cpu();
3722         flush_data = per_cpu_ptr(&deferred_flush, cpuid);
3723
3724         /* Flush all CPUs' entries to avoid deferring too much.  If
3725          * this becomes a bottleneck, can just flush us, and rely on
3726          * flush timer for the rest.
3727          */
3728         if (flush_data->size == HIGH_WATER_MARK) {
3729                 int cpu;
3730
3731                 for_each_online_cpu(cpu)
3732                         flush_unmaps_timeout(cpu);
3733         }
3734
3735         spin_lock_irqsave(&flush_data->lock, flags);
3736
3737         iommu = domain_get_iommu(dom);
3738         iommu_id = iommu->seq_id;
3739
3740         entry_id = flush_data->tables[iommu_id].next;
3741         ++(flush_data->tables[iommu_id].next);
3742
3743         entry = &flush_data->tables[iommu_id].entries[entry_id];
3744         entry->domain = dom;
3745         entry->iova_pfn = iova_pfn;
3746         entry->nrpages = nrpages;
3747         entry->freelist = freelist;
3748
3749         if (!flush_data->timer_on) {
3750                 mod_timer(&flush_data->timer, jiffies + msecs_to_jiffies(10));
3751                 flush_data->timer_on = 1;
3752         }
3753         flush_data->size++;
3754         spin_unlock_irqrestore(&flush_data->lock, flags);
3755
3756         put_cpu();
3757 }
3758
3759 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3760 {
3761         struct dmar_domain *domain;
3762         unsigned long start_pfn, last_pfn;
3763         unsigned long nrpages;
3764         unsigned long iova_pfn;
3765         struct intel_iommu *iommu;
3766         struct page *freelist;
3767
3768         if (iommu_no_mapping(dev))
3769                 return;
3770
3771         domain = find_domain(dev);
3772         BUG_ON(!domain);
3773
3774         iommu = domain_get_iommu(domain);
3775
3776         iova_pfn = IOVA_PFN(dev_addr);
3777
3778         nrpages = aligned_nrpages(dev_addr, size);
3779         start_pfn = mm_to_dma_pfn(iova_pfn);
3780         last_pfn = start_pfn + nrpages - 1;
3781
3782         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3783                  dev_name(dev), start_pfn, last_pfn);
3784
3785         freelist = domain_unmap(domain, start_pfn, last_pfn);
3786
3787         if (intel_iommu_strict) {
3788                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3789                                       nrpages, !freelist, 0);
3790                 /* free iova */
3791                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3792                 dma_free_pagelist(freelist);
3793         } else {
3794                 add_unmap(domain, iova_pfn, nrpages, freelist);
3795                 /*
3796                  * queue up the release of the unmap to save the 1/6th of the
3797                  * cpu used up by the iotlb flush operation...
3798                  */
3799         }
3800 }
3801
3802 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3803                              size_t size, enum dma_data_direction dir,
3804                              unsigned long attrs)
3805 {
3806         intel_unmap(dev, dev_addr, size);
3807 }
3808
3809 static void *intel_alloc_coherent(struct device *dev, size_t size,
3810                                   dma_addr_t *dma_handle, gfp_t flags,
3811                                   unsigned long attrs)
3812 {
3813         struct page *page = NULL;
3814         int order;
3815
3816         size = PAGE_ALIGN(size);
3817         order = get_order(size);
3818
3819         if (!iommu_no_mapping(dev))
3820                 flags &= ~(GFP_DMA | GFP_DMA32);
3821         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3822                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3823                         flags |= GFP_DMA;
3824                 else
3825                         flags |= GFP_DMA32;
3826         }
3827
3828         if (gfpflags_allow_blocking(flags)) {
3829                 unsigned int count = size >> PAGE_SHIFT;
3830
3831                 page = dma_alloc_from_contiguous(dev, count, order);
3832                 if (page && iommu_no_mapping(dev) &&
3833                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3834                         dma_release_from_contiguous(dev, page, count);
3835                         page = NULL;
3836                 }
3837         }
3838
3839         if (!page)
3840                 page = alloc_pages(flags, order);
3841         if (!page)
3842                 return NULL;
3843         memset(page_address(page), 0, size);
3844
3845         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3846                                          DMA_BIDIRECTIONAL,
3847                                          dev->coherent_dma_mask);
3848         if (*dma_handle)
3849                 return page_address(page);
3850         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3851                 __free_pages(page, order);
3852
3853         return NULL;
3854 }
3855
3856 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3857                                 dma_addr_t dma_handle, unsigned long attrs)
3858 {
3859         int order;
3860         struct page *page = virt_to_page(vaddr);
3861
3862         size = PAGE_ALIGN(size);
3863         order = get_order(size);
3864
3865         intel_unmap(dev, dma_handle, size);
3866         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3867                 __free_pages(page, order);
3868 }
3869
3870 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3871                            int nelems, enum dma_data_direction dir,
3872                            unsigned long attrs)
3873 {
3874         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3875         unsigned long nrpages = 0;
3876         struct scatterlist *sg;
3877         int i;
3878
3879         for_each_sg(sglist, sg, nelems, i) {
3880                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3881         }
3882
3883         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3884 }
3885
3886 static int intel_nontranslate_map_sg(struct device *hddev,
3887         struct scatterlist *sglist, int nelems, int dir)
3888 {
3889         int i;
3890         struct scatterlist *sg;
3891
3892         for_each_sg(sglist, sg, nelems, i) {
3893                 BUG_ON(!sg_page(sg));
3894                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3895                 sg->dma_length = sg->length;
3896         }
3897         return nelems;
3898 }
3899
3900 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3901                         enum dma_data_direction dir, unsigned long attrs)
3902 {
3903         int i;
3904         struct dmar_domain *domain;
3905         size_t size = 0;
3906         int prot = 0;
3907         unsigned long iova_pfn;
3908         int ret;
3909         struct scatterlist *sg;
3910         unsigned long start_vpfn;
3911         struct intel_iommu *iommu;
3912
3913         BUG_ON(dir == DMA_NONE);
3914         if (iommu_no_mapping(dev))
3915                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3916
3917         domain = get_valid_domain_for_dev(dev);
3918         if (!domain)
3919                 return 0;
3920
3921         iommu = domain_get_iommu(domain);
3922
3923         for_each_sg(sglist, sg, nelems, i)
3924                 size += aligned_nrpages(sg->offset, sg->length);
3925
3926         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3927                                 *dev->dma_mask);
3928         if (!iova_pfn) {
3929                 sglist->dma_length = 0;
3930                 return 0;
3931         }
3932
3933         /*
3934          * Check if DMAR supports zero-length reads on write only
3935          * mappings..
3936          */
3937         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3938                         !cap_zlr(iommu->cap))
3939                 prot |= DMA_PTE_READ;
3940         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3941                 prot |= DMA_PTE_WRITE;
3942
3943         start_vpfn = mm_to_dma_pfn(iova_pfn);
3944
3945         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3946         if (unlikely(ret)) {
3947                 dma_pte_free_pagetable(domain, start_vpfn,
3948                                        start_vpfn + size - 1);
3949                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3950                 return 0;
3951         }
3952
3953         /* it's a non-present to present mapping. Only flush if caching mode */
3954         if (cap_caching_mode(iommu->cap))
3955                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3956         else
3957                 iommu_flush_write_buffer(iommu);
3958
3959         return nelems;
3960 }
3961
3962 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3963 {
3964         return !dma_addr;
3965 }
3966
3967 struct dma_map_ops intel_dma_ops = {
3968         .alloc = intel_alloc_coherent,
3969         .free = intel_free_coherent,
3970         .map_sg = intel_map_sg,
3971         .unmap_sg = intel_unmap_sg,
3972         .map_page = intel_map_page,
3973         .unmap_page = intel_unmap_page,
3974         .mapping_error = intel_mapping_error,
3975 };
3976
3977 static inline int iommu_domain_cache_init(void)
3978 {
3979         int ret = 0;
3980
3981         iommu_domain_cache = kmem_cache_create("iommu_domain",
3982                                          sizeof(struct dmar_domain),
3983                                          0,
3984                                          SLAB_HWCACHE_ALIGN,
3985
3986                                          NULL);
3987         if (!iommu_domain_cache) {
3988                 pr_err("Couldn't create iommu_domain cache\n");
3989                 ret = -ENOMEM;
3990         }
3991
3992         return ret;
3993 }
3994
3995 static inline int iommu_devinfo_cache_init(void)
3996 {
3997         int ret = 0;
3998
3999         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4000                                          sizeof(struct device_domain_info),
4001                                          0,
4002                                          SLAB_HWCACHE_ALIGN,
4003                                          NULL);
4004         if (!iommu_devinfo_cache) {
4005                 pr_err("Couldn't create devinfo cache\n");
4006                 ret = -ENOMEM;
4007         }
4008
4009         return ret;
4010 }
4011
4012 static int __init iommu_init_mempool(void)
4013 {
4014         int ret;
4015         ret = iova_cache_get();
4016         if (ret)
4017                 return ret;
4018
4019         ret = iommu_domain_cache_init();
4020         if (ret)
4021                 goto domain_error;
4022
4023         ret = iommu_devinfo_cache_init();
4024         if (!ret)
4025                 return ret;
4026
4027         kmem_cache_destroy(iommu_domain_cache);
4028 domain_error:
4029         iova_cache_put();
4030
4031         return -ENOMEM;
4032 }
4033
4034 static void __init iommu_exit_mempool(void)
4035 {
4036         kmem_cache_destroy(iommu_devinfo_cache);
4037         kmem_cache_destroy(iommu_domain_cache);
4038         iova_cache_put();
4039 }
4040
4041 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4042 {
4043         struct dmar_drhd_unit *drhd;
4044         u32 vtbar;
4045         int rc;
4046
4047         /* We know that this device on this chipset has its own IOMMU.
4048          * If we find it under a different IOMMU, then the BIOS is lying
4049          * to us. Hope that the IOMMU for this device is actually
4050          * disabled, and it needs no translation...
4051          */
4052         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4053         if (rc) {
4054                 /* "can't" happen */
4055                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4056                 return;
4057         }
4058         vtbar &= 0xffff0000;
4059
4060         /* we know that the this iommu should be at offset 0xa000 from vtbar */
4061         drhd = dmar_find_matched_drhd_unit(pdev);
4062         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4063                             TAINT_FIRMWARE_WORKAROUND,
4064                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4065                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4066 }
4067 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4068
4069 static void __init init_no_remapping_devices(void)
4070 {
4071         struct dmar_drhd_unit *drhd;
4072         struct device *dev;
4073         int i;
4074
4075         for_each_drhd_unit(drhd) {
4076                 if (!drhd->include_all) {
4077                         for_each_active_dev_scope(drhd->devices,
4078                                                   drhd->devices_cnt, i, dev)
4079                                 break;
4080                         /* ignore DMAR unit if no devices exist */
4081                         if (i == drhd->devices_cnt)
4082                                 drhd->ignored = 1;
4083                 }
4084         }
4085
4086         for_each_active_drhd_unit(drhd) {
4087                 if (drhd->include_all)
4088                         continue;
4089
4090                 for_each_active_dev_scope(drhd->devices,
4091                                           drhd->devices_cnt, i, dev)
4092                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4093                                 break;
4094                 if (i < drhd->devices_cnt)
4095                         continue;
4096
4097                 /* This IOMMU has *only* gfx devices. Either bypass it or
4098                    set the gfx_mapped flag, as appropriate */
4099                 if (dmar_map_gfx) {
4100                         intel_iommu_gfx_mapped = 1;
4101                 } else {
4102                         drhd->ignored = 1;
4103                         for_each_active_dev_scope(drhd->devices,
4104                                                   drhd->devices_cnt, i, dev)
4105                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4106                 }
4107         }
4108 }
4109
4110 #ifdef CONFIG_SUSPEND
4111 static int init_iommu_hw(void)
4112 {
4113         struct dmar_drhd_unit *drhd;
4114         struct intel_iommu *iommu = NULL;
4115
4116         for_each_active_iommu(iommu, drhd)
4117                 if (iommu->qi)
4118                         dmar_reenable_qi(iommu);
4119
4120         for_each_iommu(iommu, drhd) {
4121                 if (drhd->ignored) {
4122                         /*
4123                          * we always have to disable PMRs or DMA may fail on
4124                          * this device
4125                          */
4126                         if (force_on)
4127                                 iommu_disable_protect_mem_regions(iommu);
4128                         continue;
4129                 }
4130
4131                 iommu_flush_write_buffer(iommu);
4132
4133                 iommu_set_root_entry(iommu);
4134
4135                 iommu->flush.flush_context(iommu, 0, 0, 0,
4136                                            DMA_CCMD_GLOBAL_INVL);
4137                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4138                 iommu_enable_translation(iommu);
4139                 iommu_disable_protect_mem_regions(iommu);
4140         }
4141
4142         return 0;
4143 }
4144
4145 static void iommu_flush_all(void)
4146 {
4147         struct dmar_drhd_unit *drhd;
4148         struct intel_iommu *iommu;
4149
4150         for_each_active_iommu(iommu, drhd) {
4151                 iommu->flush.flush_context(iommu, 0, 0, 0,
4152                                            DMA_CCMD_GLOBAL_INVL);
4153                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4154                                          DMA_TLB_GLOBAL_FLUSH);
4155         }
4156 }
4157
4158 static int iommu_suspend(void)
4159 {
4160         struct dmar_drhd_unit *drhd;
4161         struct intel_iommu *iommu = NULL;
4162         unsigned long flag;
4163
4164         for_each_active_iommu(iommu, drhd) {
4165                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
4166                                                  GFP_ATOMIC);
4167                 if (!iommu->iommu_state)
4168                         goto nomem;
4169         }
4170
4171         iommu_flush_all();
4172
4173         for_each_active_iommu(iommu, drhd) {
4174                 iommu_disable_translation(iommu);
4175
4176                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4177
4178                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4179                         readl(iommu->reg + DMAR_FECTL_REG);
4180                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4181                         readl(iommu->reg + DMAR_FEDATA_REG);
4182                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4183                         readl(iommu->reg + DMAR_FEADDR_REG);
4184                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4185                         readl(iommu->reg + DMAR_FEUADDR_REG);
4186
4187                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4188         }
4189         return 0;
4190
4191 nomem:
4192         for_each_active_iommu(iommu, drhd)
4193                 kfree(iommu->iommu_state);
4194
4195         return -ENOMEM;
4196 }
4197
4198 static void iommu_resume(void)
4199 {
4200         struct dmar_drhd_unit *drhd;
4201         struct intel_iommu *iommu = NULL;
4202         unsigned long flag;
4203
4204         if (init_iommu_hw()) {
4205                 if (force_on)
4206                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4207                 else
4208                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4209                 return;
4210         }
4211
4212         for_each_active_iommu(iommu, drhd) {
4213
4214                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4215
4216                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4217                         iommu->reg + DMAR_FECTL_REG);
4218                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4219                         iommu->reg + DMAR_FEDATA_REG);
4220                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4221                         iommu->reg + DMAR_FEADDR_REG);
4222                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4223                         iommu->reg + DMAR_FEUADDR_REG);
4224
4225                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4226         }
4227
4228         for_each_active_iommu(iommu, drhd)
4229                 kfree(iommu->iommu_state);
4230 }
4231
4232 static struct syscore_ops iommu_syscore_ops = {
4233         .resume         = iommu_resume,
4234         .suspend        = iommu_suspend,
4235 };
4236
4237 static void __init init_iommu_pm_ops(void)
4238 {
4239         register_syscore_ops(&iommu_syscore_ops);
4240 }
4241
4242 #else
/* Suspend support is compiled out: there is nothing to register. */
static inline void init_iommu_pm_ops(void)
{
}
#endif  /* CONFIG_SUSPEND */
4245
4246
4247 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4248 {
4249         struct acpi_dmar_reserved_memory *rmrr;
4250         int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4251         struct dmar_rmrr_unit *rmrru;
4252         size_t length;
4253
4254         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4255         if (!rmrru)
4256                 goto out;
4257
4258         rmrru->hdr = header;
4259         rmrr = (struct acpi_dmar_reserved_memory *)header;
4260         rmrru->base_address = rmrr->base_address;
4261         rmrru->end_address = rmrr->end_address;
4262
4263         length = rmrr->end_address - rmrr->base_address + 1;
4264         rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4265                                               IOMMU_RESV_DIRECT);
4266         if (!rmrru->resv)
4267                 goto free_rmrru;
4268
4269         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4270                                 ((void *)rmrr) + rmrr->header.length,
4271                                 &rmrru->devices_cnt);
4272         if (rmrru->devices_cnt && rmrru->devices == NULL)
4273                 goto free_all;
4274
4275         list_add(&rmrru->list, &dmar_rmrr_units);
4276
4277         return 0;
4278 free_all:
4279         kfree(rmrru->resv);
4280 free_rmrru:
4281         kfree(rmrru);
4282 out:
4283         return -ENOMEM;
4284 }
4285
4286 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4287 {
4288         struct dmar_atsr_unit *atsru;
4289         struct acpi_dmar_atsr *tmp;
4290
4291         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4292                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4293                 if (atsr->segment != tmp->segment)
4294                         continue;
4295                 if (atsr->header.length != tmp->header.length)
4296                         continue;
4297                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4298                         return atsru;
4299         }
4300
4301         return NULL;
4302 }
4303
4304 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4305 {
4306         struct acpi_dmar_atsr *atsr;
4307         struct dmar_atsr_unit *atsru;
4308
4309         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
4310                 return 0;
4311
4312         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4313         atsru = dmar_find_atsr(atsr);
4314         if (atsru)
4315                 return 0;
4316
4317         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4318         if (!atsru)
4319                 return -ENOMEM;
4320
4321         /*
4322          * If memory is allocated from slab by ACPI _DSM method, we need to
4323          * copy the memory content because the memory buffer will be freed
4324          * on return.
4325          */
4326         atsru->hdr = (void *)(atsru + 1);
4327         memcpy(atsru->hdr, hdr, hdr->length);
4328         atsru->include_all = atsr->flags & 0x1;
4329         if (!atsru->include_all) {
4330                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4331                                 (void *)atsr + atsr->header.length,
4332                                 &atsru->devices_cnt);
4333                 if (atsru->devices_cnt && atsru->devices == NULL) {
4334                         kfree(atsru);
4335                         return -ENOMEM;
4336                 }
4337         }
4338
4339         list_add_rcu(&atsru->list, &dmar_atsr_units);
4340
4341         return 0;
4342 }
4343
4344 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4345 {
4346         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4347         kfree(atsru);
4348 }
4349
4350 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4351 {
4352         struct acpi_dmar_atsr *atsr;
4353         struct dmar_atsr_unit *atsru;
4354
4355         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4356         atsru = dmar_find_atsr(atsr);
4357         if (atsru) {
4358                 list_del_rcu(&atsru->list);
4359                 synchronize_rcu();
4360                 intel_iommu_free_atsr(atsru);
4361         }
4362
4363         return 0;
4364 }
4365
4366 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4367 {
4368         int i;
4369         struct device *dev;
4370         struct acpi_dmar_atsr *atsr;
4371         struct dmar_atsr_unit *atsru;
4372
4373         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4374         atsru = dmar_find_atsr(atsr);
4375         if (!atsru)
4376                 return 0;
4377
4378         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4379                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4380                                           i, dev)
4381                         return -EBUSY;
4382         }
4383
4384         return 0;
4385 }
4386
4387 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4388 {
4389         int sp, ret = 0;
4390         struct intel_iommu *iommu = dmaru->iommu;
4391
4392         if (g_iommus[iommu->seq_id])
4393                 return 0;
4394
4395         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4396                 pr_warn("%s: Doesn't support hardware pass through.\n",
4397                         iommu->name);
4398                 return -ENXIO;
4399         }
4400         if (!ecap_sc_support(iommu->ecap) &&
4401             domain_update_iommu_snooping(iommu)) {
4402                 pr_warn("%s: Doesn't support snooping.\n",
4403                         iommu->name);
4404                 return -ENXIO;
4405         }
4406         sp = domain_update_iommu_superpage(iommu) - 1;
4407         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4408                 pr_warn("%s: Doesn't support large page.\n",
4409                         iommu->name);
4410                 return -ENXIO;
4411         }
4412
4413         /*
4414          * Disable translation if already enabled prior to OS handover.
4415          */
4416         if (iommu->gcmd & DMA_GCMD_TE)
4417                 iommu_disable_translation(iommu);
4418
4419         g_iommus[iommu->seq_id] = iommu;
4420         ret = iommu_init_domains(iommu);
4421         if (ret == 0)
4422                 ret = iommu_alloc_root_entry(iommu);
4423         if (ret)
4424                 goto out;
4425
4426 #ifdef CONFIG_INTEL_IOMMU_SVM
4427         if (pasid_enabled(iommu))
4428                 intel_svm_alloc_pasid_tables(iommu);
4429 #endif
4430
4431         if (dmaru->ignored) {
4432                 /*
4433                  * we always have to disable PMRs or DMA may fail on this device
4434                  */
4435                 if (force_on)
4436                         iommu_disable_protect_mem_regions(iommu);
4437                 return 0;
4438         }
4439
4440         intel_iommu_init_qi(iommu);
4441         iommu_flush_write_buffer(iommu);
4442
4443 #ifdef CONFIG_INTEL_IOMMU_SVM
4444         if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4445                 ret = intel_svm_enable_prq(iommu);
4446                 if (ret)
4447                         goto disable_iommu;
4448         }
4449 #endif
4450         ret = dmar_set_interrupt(iommu);
4451         if (ret)
4452                 goto disable_iommu;
4453
4454         iommu_set_root_entry(iommu);
4455         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4456         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4457         iommu_enable_translation(iommu);
4458
4459         iommu_disable_protect_mem_regions(iommu);
4460         return 0;
4461
4462 disable_iommu:
4463         disable_dmar_iommu(iommu);
4464 out:
4465         free_dmar_iommu(iommu);
4466         return ret;
4467 }
4468
4469 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4470 {
4471         int ret = 0;
4472         struct intel_iommu *iommu = dmaru->iommu;
4473
4474         if (!intel_iommu_enabled)
4475                 return 0;
4476         if (iommu == NULL)
4477                 return -EINVAL;
4478
4479         if (insert) {
4480                 ret = intel_iommu_add(dmaru);
4481         } else {
4482                 disable_dmar_iommu(iommu);
4483                 free_dmar_iommu(iommu);
4484         }
4485
4486         return ret;
4487 }
4488
4489 static void intel_iommu_free_dmars(void)
4490 {
4491         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4492         struct dmar_atsr_unit *atsru, *atsr_n;
4493
4494         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4495                 list_del(&rmrru->list);
4496                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4497                 kfree(rmrru->resv);
4498                 kfree(rmrru);
4499         }
4500
4501         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4502                 list_del(&atsru->list);
4503                 intel_iommu_free_atsr(atsru);
4504         }
4505 }
4506
4507 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4508 {
4509         int i, ret = 1;
4510         struct pci_bus *bus;
4511         struct pci_dev *bridge = NULL;
4512         struct device *tmp;
4513         struct acpi_dmar_atsr *atsr;
4514         struct dmar_atsr_unit *atsru;
4515
4516         dev = pci_physfn(dev);
4517         for (bus = dev->bus; bus; bus = bus->parent) {
4518                 bridge = bus->self;
4519                 /* If it's an integrated device, allow ATS */
4520                 if (!bridge)
4521                         return 1;
4522                 /* Connected via non-PCIe: no ATS */
4523                 if (!pci_is_pcie(bridge) ||
4524                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4525                         return 0;
4526                 /* If we found the root port, look it up in the ATSR */
4527                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4528                         break;
4529         }
4530
4531         rcu_read_lock();
4532         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4533                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4534                 if (atsr->segment != pci_domain_nr(dev->bus))
4535                         continue;
4536
4537                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4538                         if (tmp == &bridge->dev)
4539                                 goto out;
4540
4541                 if (atsru->include_all)
4542                         goto out;
4543         }
4544         ret = 0;
4545 out:
4546         rcu_read_unlock();
4547
4548         return ret;
4549 }
4550
4551 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4552 {
4553         int ret = 0;
4554         struct dmar_rmrr_unit *rmrru;
4555         struct dmar_atsr_unit *atsru;
4556         struct acpi_dmar_atsr *atsr;
4557         struct acpi_dmar_reserved_memory *rmrr;
4558
4559         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4560                 return 0;
4561
4562         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4563                 rmrr = container_of(rmrru->hdr,
4564                                     struct acpi_dmar_reserved_memory, header);
4565                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4566                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4567                                 ((void *)rmrr) + rmrr->header.length,
4568                                 rmrr->segment, rmrru->devices,
4569                                 rmrru->devices_cnt);
4570                         if(ret < 0)
4571                                 return ret;
4572                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4573                         dmar_remove_dev_scope(info, rmrr->segment,
4574                                 rmrru->devices, rmrru->devices_cnt);
4575                 }
4576         }
4577
4578         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4579                 if (atsru->include_all)
4580                         continue;
4581
4582                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4583                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4584                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4585                                         (void *)atsr + atsr->header.length,
4586                                         atsr->segment, atsru->devices,
4587                                         atsru->devices_cnt);
4588                         if (ret > 0)
4589                                 break;
4590                         else if(ret < 0)
4591                                 return ret;
4592                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4593                         if (dmar_remove_dev_scope(info, atsr->segment,
4594                                         atsru->devices, atsru->devices_cnt))
4595                                 break;
4596                 }
4597         }
4598
4599         return 0;
4600 }
4601
4602 /*
4603  * Here we only respond to action of unbound device from driver.
4604  *
4605  * Added device is not attached to its DMAR domain here yet. That will happen
4606  * when mapping the device to iova.
4607  */
4608 static int device_notifier(struct notifier_block *nb,
4609                                   unsigned long action, void *data)
4610 {
4611         struct device *dev = data;
4612         struct dmar_domain *domain;
4613
4614         if (iommu_dummy(dev))
4615                 return 0;
4616
4617         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4618                 return 0;
4619
4620         domain = find_domain(dev);
4621         if (!domain)
4622                 return 0;
4623
4624         dmar_remove_one_dev_info(domain, dev);
4625         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4626                 domain_exit(domain);
4627
4628         return 0;
4629 }
4630
4631 static struct notifier_block device_nb = {
4632         .notifier_call = device_notifier,
4633 };
4634
4635 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4636                                        unsigned long val, void *v)
4637 {
4638         struct memory_notify *mhp = v;
4639         unsigned long long start, end;
4640         unsigned long start_vpfn, last_vpfn;
4641
4642         switch (val) {
4643         case MEM_GOING_ONLINE:
4644                 start = mhp->start_pfn << PAGE_SHIFT;
4645                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4646                 if (iommu_domain_identity_map(si_domain, start, end)) {
4647                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4648                                 start, end);
4649                         return NOTIFY_BAD;
4650                 }
4651                 break;
4652
4653         case MEM_OFFLINE:
4654         case MEM_CANCEL_ONLINE:
4655                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4656                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4657                 while (start_vpfn <= last_vpfn) {
4658                         struct iova *iova;
4659                         struct dmar_drhd_unit *drhd;
4660                         struct intel_iommu *iommu;
4661                         struct page *freelist;
4662
4663                         iova = find_iova(&si_domain->iovad, start_vpfn);
4664                         if (iova == NULL) {
4665                                 pr_debug("Failed get IOVA for PFN %lx\n",
4666                                          start_vpfn);
4667                                 break;
4668                         }
4669
4670                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4671                                                      start_vpfn, last_vpfn);
4672                         if (iova == NULL) {
4673                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4674                                         start_vpfn, last_vpfn);
4675                                 return NOTIFY_BAD;
4676                         }
4677
4678                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4679                                                iova->pfn_hi);
4680
4681                         rcu_read_lock();
4682                         for_each_active_iommu(iommu, drhd)
4683                                 iommu_flush_iotlb_psi(iommu, si_domain,
4684                                         iova->pfn_lo, iova_size(iova),
4685                                         !freelist, 0);
4686                         rcu_read_unlock();
4687                         dma_free_pagelist(freelist);
4688
4689                         start_vpfn = iova->pfn_hi + 1;
4690                         free_iova_mem(iova);
4691                 }
4692                 break;
4693         }
4694
4695         return NOTIFY_OK;
4696 }
4697
4698 static struct notifier_block intel_iommu_memory_nb = {
4699         .notifier_call = intel_iommu_memory_notifier,
4700         .priority = 0
4701 };
4702
4703 static void free_all_cpu_cached_iovas(unsigned int cpu)
4704 {
4705         int i;
4706
4707         for (i = 0; i < g_num_of_iommus; i++) {
4708                 struct intel_iommu *iommu = g_iommus[i];
4709                 struct dmar_domain *domain;
4710                 int did;
4711
4712                 if (!iommu)
4713                         continue;
4714
4715                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4716                         domain = get_iommu_domain(iommu, (u16)did);
4717
4718                         if (!domain)
4719                                 continue;
4720                         free_cpu_cached_iovas(cpu, &domain->iovad);
4721                 }
4722         }
4723 }
4724
/*
 * CPU hotplug callback for a CPU that has gone away: drop its cached IOVAs
 * and flush its pending deferred unmaps.  Always returns 0.
 */
static int intel_iommu_cpu_dead(unsigned int cpu)
{
	free_all_cpu_cached_iovas(cpu);
	flush_unmaps_timeout(cpu);

	return 0;
}
4731
4732 static ssize_t intel_iommu_show_version(struct device *dev,
4733                                         struct device_attribute *attr,
4734                                         char *buf)
4735 {
4736         struct intel_iommu *iommu = dev_get_drvdata(dev);
4737         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4738         return sprintf(buf, "%d:%d\n",
4739                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4740 }
4741 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4742
4743 static ssize_t intel_iommu_show_address(struct device *dev,
4744                                         struct device_attribute *attr,
4745                                         char *buf)
4746 {
4747         struct intel_iommu *iommu = dev_get_drvdata(dev);
4748         return sprintf(buf, "%llx\n", iommu->reg_phys);
4749 }
4750 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4751
4752 static ssize_t intel_iommu_show_cap(struct device *dev,
4753                                     struct device_attribute *attr,
4754                                     char *buf)
4755 {
4756         struct intel_iommu *iommu = dev_get_drvdata(dev);
4757         return sprintf(buf, "%llx\n", iommu->cap);
4758 }
4759 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4760
4761 static ssize_t intel_iommu_show_ecap(struct device *dev,
4762                                     struct device_attribute *attr,
4763                                     char *buf)
4764 {
4765         struct intel_iommu *iommu = dev_get_drvdata(dev);
4766         return sprintf(buf, "%llx\n", iommu->ecap);
4767 }
4768 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4769
4770 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4771                                       struct device_attribute *attr,
4772                                       char *buf)
4773 {
4774         struct intel_iommu *iommu = dev_get_drvdata(dev);
4775         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4776 }
4777 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4778
4779 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4780                                            struct device_attribute *attr,
4781                                            char *buf)
4782 {
4783         struct intel_iommu *iommu = dev_get_drvdata(dev);
4784         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4785                                                   cap_ndoms(iommu->cap)));
4786 }
4787 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4788
4789 static struct attribute *intel_iommu_attrs[] = {
4790         &dev_attr_version.attr,
4791         &dev_attr_address.attr,
4792         &dev_attr_cap.attr,
4793         &dev_attr_ecap.attr,
4794         &dev_attr_domains_supported.attr,
4795         &dev_attr_domains_used.attr,
4796         NULL,
4797 };
4798
4799 static struct attribute_group intel_iommu_group = {
4800         .name = "intel-iommu",
4801         .attrs = intel_iommu_attrs,
4802 };
4803
4804 const struct attribute_group *intel_iommu_groups[] = {
4805         &intel_iommu_group,
4806         NULL,
4807 };
4808
4809 int __init intel_iommu_init(void)
4810 {
4811         int ret = -ENODEV;
4812         struct dmar_drhd_unit *drhd;
4813         struct intel_iommu *iommu;
4814
4815         /* VT-d is required for a TXT/tboot launch, so enforce that */
4816         force_on = tboot_force_iommu();
4817
4818         if (iommu_init_mempool()) {
4819                 if (force_on)
4820                         panic("tboot: Failed to initialize iommu memory\n");
4821                 return -ENOMEM;
4822         }
4823
4824         down_write(&dmar_global_lock);
4825         if (dmar_table_init()) {
4826                 if (force_on)
4827                         panic("tboot: Failed to initialize DMAR table\n");
4828                 goto out_free_dmar;
4829         }
4830
4831         if (dmar_dev_scope_init() < 0) {
4832                 if (force_on)
4833                         panic("tboot: Failed to initialize DMAR device scope\n");
4834                 goto out_free_dmar;
4835         }
4836
4837         if (no_iommu || dmar_disabled)
4838                 goto out_free_dmar;
4839
4840         if (list_empty(&dmar_rmrr_units))
4841                 pr_info("No RMRR found\n");
4842
4843         if (list_empty(&dmar_atsr_units))
4844                 pr_info("No ATSR found\n");
4845
4846         if (dmar_init_reserved_ranges()) {
4847                 if (force_on)
4848                         panic("tboot: Failed to reserve iommu ranges\n");
4849                 goto out_free_reserved_range;
4850         }
4851
4852         init_no_remapping_devices();
4853
4854         ret = init_dmars();
4855         if (ret) {
4856                 if (force_on)
4857                         panic("tboot: Failed to initialize DMARs\n");
4858                 pr_err("Initialization failed\n");
4859                 goto out_free_reserved_range;
4860         }
4861         up_write(&dmar_global_lock);
4862         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4863
4864 #ifdef CONFIG_SWIOTLB
4865         swiotlb = 0;
4866 #endif
4867         dma_ops = &intel_dma_ops;
4868
4869         init_iommu_pm_ops();
4870
4871         for_each_active_iommu(iommu, drhd)
4872                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4873                                                        intel_iommu_groups,
4874                                                        "%s", iommu->name);
4875
4876         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4877         bus_register_notifier(&pci_bus_type, &device_nb);
4878         if (si_domain && !hw_pass_through)
4879                 register_memory_notifier(&intel_iommu_memory_nb);
4880         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4881                           intel_iommu_cpu_dead);
4882         intel_iommu_enabled = 1;
4883
4884         return 0;
4885
4886 out_free_reserved_range:
4887         put_iova_domain(&reserved_iova_list);
4888 out_free_dmar:
4889         intel_iommu_free_dmars();
4890         up_write(&dmar_global_lock);
4891         iommu_exit_mempool();
4892         return ret;
4893 }
4894
4895 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4896 {
4897         struct intel_iommu *iommu = opaque;
4898
4899         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4900         return 0;
4901 }
4902
4903 /*
4904  * NB - intel-iommu lacks any sort of reference counting for the users of
4905  * dependent devices.  If multiple endpoints have intersecting dependent
4906  * devices, unbinding the driver from any one of them will possibly leave
4907  * the others unable to operate.
4908  */
4909 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4910 {
4911         if (!iommu || !dev || !dev_is_pci(dev))
4912                 return;
4913
4914         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4915 }
4916
4917 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4918 {
4919         struct intel_iommu *iommu;
4920         unsigned long flags;
4921
4922         assert_spin_locked(&device_domain_lock);
4923
4924         if (WARN_ON(!info))
4925                 return;
4926
4927         iommu = info->iommu;
4928
4929         if (info->dev) {
4930                 iommu_disable_dev_iotlb(info);
4931                 domain_context_clear(iommu, info->dev);
4932         }
4933
4934         unlink_domain_info(info);
4935
4936         spin_lock_irqsave(&iommu->lock, flags);
4937         domain_detach_iommu(info->domain, iommu);
4938         spin_unlock_irqrestore(&iommu->lock, flags);
4939
4940         free_devinfo_mem(info);
4941 }
4942
4943 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4944                                      struct device *dev)
4945 {
4946         struct device_domain_info *info;
4947         unsigned long flags;
4948
4949         spin_lock_irqsave(&device_domain_lock, flags);
4950         info = dev->archdata.iommu;
4951         __dmar_remove_one_dev_info(info);
4952         spin_unlock_irqrestore(&device_domain_lock, flags);
4953 }
4954
4955 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4956 {
4957         int adjust_width;
4958
4959         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4960                         DMA_32BIT_PFN);
4961         domain_reserve_special_ranges(domain);
4962
4963         /* calculate AGAW */
4964         domain->gaw = guest_width;
4965         adjust_width = guestwidth_to_adjustwidth(guest_width);
4966         domain->agaw = width_to_agaw(adjust_width);
4967
4968         domain->iommu_coherency = 0;
4969         domain->iommu_snooping = 0;
4970         domain->iommu_superpage = 0;
4971         domain->max_addr = 0;
4972
4973         /* always allocate the top pgd */
4974         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4975         if (!domain->pgd)
4976                 return -ENOMEM;
4977         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4978         return 0;
4979 }
4980
4981 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4982 {
4983         struct dmar_domain *dmar_domain;
4984         struct iommu_domain *domain;
4985
4986         if (type != IOMMU_DOMAIN_UNMANAGED)
4987                 return NULL;
4988
4989         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4990         if (!dmar_domain) {
4991                 pr_err("Can't allocate dmar_domain\n");
4992                 return NULL;
4993         }
4994         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4995                 pr_err("Domain initialization failed\n");
4996                 domain_exit(dmar_domain);
4997                 return NULL;
4998         }
4999         domain_update_iommu_cap(dmar_domain);
5000
5001         domain = &dmar_domain->domain;
5002         domain->geometry.aperture_start = 0;
5003         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5004         domain->geometry.force_aperture = true;
5005
5006         return domain;
5007 }
5008
/* IOMMU API .domain_free callback: tear down the backing dmar_domain. */
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);

        domain_exit(dmar_domain);
}
5013
5014 static int intel_iommu_attach_device(struct iommu_domain *domain,
5015                                      struct device *dev)
5016 {
5017         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5018         struct intel_iommu *iommu;
5019         int addr_width;
5020         u8 bus, devfn;
5021
5022         if (device_is_rmrr_locked(dev)) {
5023                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5024                 return -EPERM;
5025         }
5026
5027         /* normally dev is not mapped */
5028         if (unlikely(domain_context_mapped(dev))) {
5029                 struct dmar_domain *old_domain;
5030
5031                 old_domain = find_domain(dev);
5032                 if (old_domain) {
5033                         rcu_read_lock();
5034                         dmar_remove_one_dev_info(old_domain, dev);
5035                         rcu_read_unlock();
5036
5037                         if (!domain_type_is_vm_or_si(old_domain) &&
5038                              list_empty(&old_domain->devices))
5039                                 domain_exit(old_domain);
5040                 }
5041         }
5042
5043         iommu = device_to_iommu(dev, &bus, &devfn);
5044         if (!iommu)
5045                 return -ENODEV;
5046
5047         /* check if this iommu agaw is sufficient for max mapped address */
5048         addr_width = agaw_to_width(iommu->agaw);
5049         if (addr_width > cap_mgaw(iommu->cap))
5050                 addr_width = cap_mgaw(iommu->cap);
5051
5052         if (dmar_domain->max_addr > (1LL << addr_width)) {
5053                 pr_err("%s: iommu width (%d) is not "
5054                        "sufficient for the mapped address (%llx)\n",
5055                        __func__, addr_width, dmar_domain->max_addr);
5056                 return -EFAULT;
5057         }
5058         dmar_domain->gaw = addr_width;
5059
5060         /*
5061          * Knock out extra levels of page tables if necessary
5062          */
5063         while (iommu->agaw < dmar_domain->agaw) {
5064                 struct dma_pte *pte;
5065
5066                 pte = dmar_domain->pgd;
5067                 if (dma_pte_present(pte)) {
5068                         dmar_domain->pgd = (struct dma_pte *)
5069                                 phys_to_virt(dma_pte_addr(pte));
5070                         free_pgtable_page(pte);
5071                 }
5072                 dmar_domain->agaw--;
5073         }
5074
5075         return domain_add_dev_info(dmar_domain, dev);
5076 }
5077
/* IOMMU API .detach_dev callback: remove @dev from the backing dmar_domain. */
static void intel_iommu_detach_device(struct iommu_domain *domain,
                                      struct device *dev)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);

        dmar_remove_one_dev_info(dmar_domain, dev);
}
5083
5084 static int intel_iommu_map(struct iommu_domain *domain,
5085                            unsigned long iova, phys_addr_t hpa,
5086                            size_t size, int iommu_prot)
5087 {
5088         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5089         u64 max_addr;
5090         int prot = 0;
5091         int ret;
5092
5093         if (iommu_prot & IOMMU_READ)
5094                 prot |= DMA_PTE_READ;
5095         if (iommu_prot & IOMMU_WRITE)
5096                 prot |= DMA_PTE_WRITE;
5097         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5098                 prot |= DMA_PTE_SNP;
5099
5100         max_addr = iova + size;
5101         if (dmar_domain->max_addr < max_addr) {
5102                 u64 end;
5103
5104                 /* check if minimum agaw is sufficient for mapped address */
5105                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5106                 if (end < max_addr) {
5107                         pr_err("%s: iommu width (%d) is not "
5108                                "sufficient for the mapped address (%llx)\n",
5109                                __func__, dmar_domain->gaw, max_addr);
5110                         return -EFAULT;
5111                 }
5112                 dmar_domain->max_addr = max_addr;
5113         }
5114         /* Round up size to next multiple of PAGE_SIZE, if it and
5115            the low bits of hpa would take us onto the next page */
5116         size = aligned_nrpages(hpa, size);
5117         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5118                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5119         return ret;
5120 }
5121
5122 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5123                                 unsigned long iova, size_t size)
5124 {
5125         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5126         struct page *freelist = NULL;
5127         struct intel_iommu *iommu;
5128         unsigned long start_pfn, last_pfn;
5129         unsigned int npages;
5130         int iommu_id, level = 0;
5131
5132         /* Cope with horrid API which requires us to unmap more than the
5133            size argument if it happens to be a large-page mapping. */
5134         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5135
5136         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5137                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5138
5139         start_pfn = iova >> VTD_PAGE_SHIFT;
5140         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5141
5142         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5143
5144         npages = last_pfn - start_pfn + 1;
5145
5146         for_each_domain_iommu(iommu_id, dmar_domain) {
5147                 iommu = g_iommus[iommu_id];
5148
5149                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5150                                       start_pfn, npages, !freelist, 0);
5151         }
5152
5153         dma_free_pagelist(freelist);
5154
5155         if (dmar_domain->max_addr == iova + size)
5156                 dmar_domain->max_addr = iova;
5157
5158         return size;
5159 }
5160
5161 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5162                                             dma_addr_t iova)
5163 {
5164         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5165         struct dma_pte *pte;
5166         int level = 0;
5167         u64 phys = 0;
5168
5169         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5170         if (pte)
5171                 phys = dma_pte_addr(pte);
5172
5173         return phys;
5174 }
5175
5176 static bool intel_iommu_capable(enum iommu_cap cap)
5177 {
5178         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5179                 return domain_update_iommu_snooping(NULL) == 1;
5180         if (cap == IOMMU_CAP_INTR_REMAP)
5181                 return irq_remapping_enabled == 1;
5182
5183         return false;
5184 }
5185
5186 static int intel_iommu_add_device(struct device *dev)
5187 {
5188         struct intel_iommu *iommu;
5189         struct iommu_group *group;
5190         u8 bus, devfn;
5191
5192         iommu = device_to_iommu(dev, &bus, &devfn);
5193         if (!iommu)
5194                 return -ENODEV;
5195
5196         iommu_device_link(iommu->iommu_dev, dev);
5197
5198         group = iommu_group_get_for_dev(dev);
5199
5200         if (IS_ERR(group))
5201                 return PTR_ERR(group);
5202
5203         iommu_group_put(group);
5204         return 0;
5205 }
5206
5207 static void intel_iommu_remove_device(struct device *dev)
5208 {
5209         struct intel_iommu *iommu;
5210         u8 bus, devfn;
5211
5212         iommu = device_to_iommu(dev, &bus, &devfn);
5213         if (!iommu)
5214                 return;
5215
5216         iommu_group_remove_device(dev);
5217
5218         iommu_device_unlink(iommu->iommu_dev, dev);
5219 }
5220
5221 static void intel_iommu_get_resv_regions(struct device *device,
5222                                          struct list_head *head)
5223 {
5224         struct iommu_resv_region *reg;
5225         struct dmar_rmrr_unit *rmrr;
5226         struct device *i_dev;
5227         int i;
5228
5229         rcu_read_lock();
5230         for_each_rmrr_units(rmrr) {
5231                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5232                                           i, i_dev) {
5233                         if (i_dev != device)
5234                                 continue;
5235
5236                         list_add_tail(&rmrr->resv->list, head);
5237                 }
5238         }
5239         rcu_read_unlock();
5240
5241         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5242                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5243                                       0, IOMMU_RESV_RESERVED);
5244         if (!reg)
5245                 return;
5246         list_add_tail(&reg->list, head);
5247 }
5248
5249 static void intel_iommu_put_resv_regions(struct device *dev,
5250                                          struct list_head *head)
5251 {
5252         struct iommu_resv_region *entry, *next;
5253
5254         list_for_each_entry_safe(entry, next, head, list) {
5255                 if (entry->type == IOMMU_RESV_RESERVED)
5256                         kfree(entry);
5257         }
5258 }
5259
5260 #ifdef CONFIG_INTEL_IOMMU_SVM
5261 #define MAX_NR_PASID_BITS (20)
5262 static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
5263 {
5264         /*
5265          * Convert ecap_pss to extend context entry pts encoding, also
5266          * respect the soft pasid_max value set by the iommu.
5267          * - number of PASID bits = ecap_pss + 1
5268          * - number of PASID table entries = 2^(pts + 5)
5269          * Therefore, pts = ecap_pss - 4
5270          * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
5271          */
5272         if (ecap_pss(iommu->ecap) < 5)
5273                 return 0;
5274
5275         /* pasid_max is encoded as actual number of entries not the bits */
5276         return find_first_bit((unsigned long *)&iommu->pasid_max,
5277                         MAX_NR_PASID_BITS) - 5;
5278 }
5279
5280 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5281 {
5282         struct device_domain_info *info;
5283         struct context_entry *context;
5284         struct dmar_domain *domain;
5285         unsigned long flags;
5286         u64 ctx_lo;
5287         int ret;
5288
5289         domain = get_valid_domain_for_dev(sdev->dev);
5290         if (!domain)
5291                 return -EINVAL;
5292
5293         spin_lock_irqsave(&device_domain_lock, flags);
5294         spin_lock(&iommu->lock);
5295
5296         ret = -EINVAL;
5297         info = sdev->dev->archdata.iommu;
5298         if (!info || !info->pasid_supported)
5299                 goto out;
5300
5301         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5302         if (WARN_ON(!context))
5303                 goto out;
5304
5305         ctx_lo = context[0].lo;
5306
5307         sdev->did = domain->iommu_did[iommu->seq_id];
5308         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5309
5310         if (!(ctx_lo & CONTEXT_PASIDE)) {
5311                 context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5312                 context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
5313                         intel_iommu_get_pts(iommu);
5314
5315                 wmb();
5316                 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5317                  * extended to permit requests-with-PASID if the PASIDE bit
5318                  * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
5319                  * however, the PASIDE bit is ignored and requests-with-PASID
5320                  * are unconditionally blocked. Which makes less sense.
5321                  * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5322                  * "guest mode" translation types depending on whether ATS
5323                  * is available or not. Annoyingly, we can't use the new
5324                  * modes *unless* PASIDE is set. */
5325                 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5326                         ctx_lo &= ~CONTEXT_TT_MASK;
5327                         if (info->ats_supported)
5328                                 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5329                         else
5330                                 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5331                 }
5332                 ctx_lo |= CONTEXT_PASIDE;
5333                 if (iommu->pasid_state_table)
5334                         ctx_lo |= CONTEXT_DINVE;
5335                 if (info->pri_supported)
5336                         ctx_lo |= CONTEXT_PRS;
5337                 context[0].lo = ctx_lo;
5338                 wmb();
5339                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5340                                            DMA_CCMD_MASK_NOBIT,
5341                                            DMA_CCMD_DEVICE_INVL);
5342         }
5343
5344         /* Enable PASID support in the device, if it wasn't already */
5345         if (!info->pasid_enabled)
5346                 iommu_enable_dev_iotlb(info);
5347
5348         if (info->ats_enabled) {
5349                 sdev->dev_iotlb = 1;
5350                 sdev->qdep = info->ats_qdep;
5351                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5352                         sdev->qdep = 0;
5353         }
5354         ret = 0;
5355
5356  out:
5357         spin_unlock(&iommu->lock);
5358         spin_unlock_irqrestore(&device_domain_lock, flags);
5359
5360         return ret;
5361 }
5362
5363 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5364 {
5365         struct intel_iommu *iommu;
5366         u8 bus, devfn;
5367
5368         if (iommu_dummy(dev)) {
5369                 dev_warn(dev,
5370                          "No IOMMU translation for device; cannot enable SVM\n");
5371                 return NULL;
5372         }
5373
5374         iommu = device_to_iommu(dev, &bus, &devfn);
5375         if ((!iommu)) {
5376                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5377                 return NULL;
5378         }
5379
5380         if (!iommu->pasid_table) {
5381                 dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5382                 return NULL;
5383         }
5384
5385         return iommu;
5386 }
5387 #endif /* CONFIG_INTEL_IOMMU_SVM */
5388
5389 static const struct iommu_ops intel_iommu_ops = {
5390         .capable                = intel_iommu_capable,
5391         .domain_alloc           = intel_iommu_domain_alloc,
5392         .domain_free            = intel_iommu_domain_free,
5393         .attach_dev             = intel_iommu_attach_device,
5394         .detach_dev             = intel_iommu_detach_device,
5395         .map                    = intel_iommu_map,
5396         .unmap                  = intel_iommu_unmap,
5397         .map_sg                 = default_iommu_map_sg,
5398         .iova_to_phys           = intel_iommu_iova_to_phys,
5399         .add_device             = intel_iommu_add_device,
5400         .remove_device          = intel_iommu_remove_device,
5401         .get_resv_regions       = intel_iommu_get_resv_regions,
5402         .put_resv_regions       = intel_iommu_put_resv_regions,
5403         .device_group           = pci_device_group,
5404         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5405 };
5406
5407 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5408 {
5409         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5410         pr_info("Disabling IOMMU for graphics on this chipset\n");
5411         dmar_map_gfx = 0;
5412 }
5413
5414 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5415 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5416 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5417 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5418 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5419 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5420 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5421
5422 static void quirk_iommu_rwbf(struct pci_dev *dev)
5423 {
5424         /*
5425          * Mobile 4 Series Chipset neglects to set RWBF capability,
5426          * but needs it. Same seems to hold for the desktop versions.
5427          */
5428         pr_info("Forcing write-buffer flush capability\n");
5429         rwbf_quirk = 1;
5430 }
5431
5432 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5433 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5434 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5435 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5436 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5437 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5438 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5439
5440 #define GGC 0x52
5441 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5442 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5443 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5444 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5445 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5446 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5447 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5448 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5449
5450 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5451 {
5452         unsigned short ggc;
5453
5454         if (pci_read_config_word(dev, GGC, &ggc))
5455                 return;
5456
5457         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5458                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5459                 dmar_map_gfx = 0;
5460         } else if (dmar_map_gfx) {
5461                 /* we have to ensure the gfx device is idle before we flush */
5462                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5463                 intel_iommu_strict = 1;
5464        }
5465 }
5466 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5467 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5468 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5469 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5470
5471 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5472    ISOCH DMAR unit for the Azalia sound device, but not give it any
5473    TLB entries, which causes it to deadlock. Check for that.  We do
5474    this in a function called from init_dmars(), instead of in a PCI
5475    quirk, because we don't want to print the obnoxious "BIOS broken"
5476    message if VT-d is actually disabled.
5477 */
5478 static void __init check_tylersburg_isoch(void)
5479 {
5480         struct pci_dev *pdev;
5481         uint32_t vtisochctrl;
5482
5483         /* If there's no Azalia in the system anyway, forget it. */
5484         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5485         if (!pdev)
5486                 return;
5487         pci_dev_put(pdev);
5488
5489         /* System Management Registers. Might be hidden, in which case
5490            we can't do the sanity check. But that's OK, because the
5491            known-broken BIOSes _don't_ actually hide it, so far. */
5492         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5493         if (!pdev)
5494                 return;
5495
5496         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5497                 pci_dev_put(pdev);
5498                 return;
5499         }
5500
5501         pci_dev_put(pdev);
5502
5503         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5504         if (vtisochctrl & 1)
5505                 return;
5506
5507         /* Drop all bits other than the number of TLB entries */
5508         vtisochctrl &= 0x1c;
5509
5510         /* If we have the recommended number of TLB entries (16), fine. */
5511         if (vtisochctrl == 0x10)
5512                 return;
5513
5514         /* Zero TLB entries? You get to ride the short bus to school. */
5515         if (!vtisochctrl) {
5516                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5517                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5518                      dmi_get_system_info(DMI_BIOS_VENDOR),
5519                      dmi_get_system_info(DMI_BIOS_VERSION),
5520                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5521                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5522                 return;
5523         }
5524
5525         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5526                vtisochctrl);
5527 }