Merge branch 'master' into x86/memblock
author     Tejun Heo <tj@kernel.org>   Mon, 28 Nov 2011 17:46:22 +0000 (09:46 -0800)
committer  Tejun Heo <tj@kernel.org>   Mon, 28 Nov 2011 17:46:22 +0000 (09:46 -0800)
Conflicts & resolutions:

* arch/x86/xen/setup.c

dc91c728fd "xen: allow extra memory to be in multiple regions"
24aa07882b "memblock, x86: Replace memblock_x86_reserve/free..."

conflicted on xen_add_extra_mem() updates.  The resolution is
trivial as the latter just wants to replace
memblock_x86_reserve_range() with memblock_reserve() (a conversion
sketch follows this list).

* drivers/pci/intel-iommu.c

166e9278a3f "x86/ia64: intel-iommu: move to drivers/iommu/"
5dfe8660a3d "bootmem: Replace work_with_active_regions() with..."

conflicted as the former moved the file under drivers/iommu/.
Resolved by applying the changes from the latter to the moved
file.

* mm/Kconfig

6661672053a "memblock: add NO_BOOTMEM config symbol"
c378ddd53f9 "memblock, x86: Make ARCH_DISCARD_MEMBLOCK a config option"

conflicted trivially.  Both added config options.  Just
letting both add their own options resolves the conflict.

* mm/memblock.c

d1f0ece6cdc "mm/memblock.c: small function definition fixes"
ed7b56a799c "memblock: Remove memblock_memory_can_coalesce()"

conflicted.  The former updates a function removed by the
latter.  The resolution is trivial.
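
The API conversion behind most of these conflicts is mechanical: the
old x86-specific memblock helpers took an address range plus a debug
label, while the generic replacements take a base and a size.  As a
minimal sketch (argument names paraphrased from the hunks below, not
the exact kernel prototypes):

	/* before: x86-specific wrappers, [start, end) plus a label */
	memblock_x86_reserve_range(start, end, "XEN PAGETABLES");
	memblock_x86_free_range(start, end);

	/* after: generic memblock API, base + size */
	memblock_reserve(start, end - start);
	memblock_free(start, end - start);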

Signed-off-by: Tejun Heo <tj@kernel.org>
18 files changed:
arch/powerpc/mm/numa.c
arch/sparc/mm/init_64.c
arch/x86/Kconfig
arch/x86/kernel/e820.c
arch/x86/kernel/setup.c
arch/x86/mm/init.c
arch/x86/mm/init_64.c
arch/x86/platform/efi/efi.c
arch/x86/xen/mmu.c
arch/x86/xen/setup.c
drivers/iommu/intel-iommu.c
include/linux/memblock.h
include/linux/mm.h
kernel/printk.c
mm/Kconfig
mm/memblock.c
mm/nobootmem.c
mm/page_alloc.c

diff --combined arch/powerpc/mm/numa.c
index 6f06ea53bca281fe028899dcd497553d74dc2043,b22a83a91cb852b92d023529e3f1b3bf011dce38..261adbd3b55a58734eec39966cc74ca40b335c33
@@@ -13,7 -13,7 +13,7 @@@
  #include <linux/init.h>
  #include <linux/mm.h>
  #include <linux/mmzone.h>
- #include <linux/module.h>
+ #include <linux/export.h>
  #include <linux/nodemask.h>
  #include <linux/cpu.h>
  #include <linux/notifier.h>
@@@ -127,25 -127,45 +127,25 @@@ static int __cpuinit fake_numa_create_n
  }
  
  /*
 - * get_active_region_work_fn - A helper function for get_node_active_region
 - *    Returns datax set to the start_pfn and end_pfn if they contain
 - *    the initial value of datax->start_pfn between them
 - * @start_pfn: start page(inclusive) of region to check
 - * @end_pfn: end page(exclusive) of region to check
 - * @datax: comes in with ->start_pfn set to value to search for and
 - *    goes out with active range if it contains it
 - * Returns 1 if search value is in range else 0
 - */
 -static int __init get_active_region_work_fn(unsigned long start_pfn,
 -                                      unsigned long end_pfn, void *datax)
 -{
 -      struct node_active_region *data;
 -      data = (struct node_active_region *)datax;
 -
 -      if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) {
 -              data->start_pfn = start_pfn;
 -              data->end_pfn = end_pfn;
 -              return 1;
 -      }
 -      return 0;
 -
 -}
 -
 -/*
 - * get_node_active_region - Return active region containing start_pfn
 + * get_node_active_region - Return active region containing pfn
   * Active range returned is empty if none found.
 - * @start_pfn: The page to return the region for.
 - * @node_ar: Returned set to the active region containing start_pfn
 + * @pfn: The page to return the region for
 + * @node_ar: Returned set to the active region containing @pfn
   */
 -static void __init get_node_active_region(unsigned long start_pfn,
 -                     struct node_active_region *node_ar)
 +static void __init get_node_active_region(unsigned long pfn,
 +                                        struct node_active_region *node_ar)
  {
 -      int nid = early_pfn_to_nid(start_pfn);
 -
 -      node_ar->nid = nid;
 -      node_ar->start_pfn = start_pfn;
 -      node_ar->end_pfn = start_pfn;
 -      work_with_active_regions(nid, get_active_region_work_fn, node_ar);
 +      unsigned long start_pfn, end_pfn;
 +      int i, nid;
 +
 +      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
 +              if (pfn >= start_pfn && pfn < end_pfn) {
 +                      node_ar->nid = nid;
 +                      node_ar->start_pfn = start_pfn;
 +                      node_ar->end_pfn = end_pfn;
 +                      break;
 +              }
 +      }
  }
  
  static void map_cpu_to_node(int cpu, int node)
@@@ -295,7 -315,10 +295,10 @@@ static int __init find_min_common_depth
        struct device_node *root;
        const char *vec5;
  
-       root = of_find_node_by_path("/rtas");
+       if (firmware_has_feature(FW_FEATURE_OPAL))
+               root = of_find_node_by_path("/ibm,opal");
+       else
+               root = of_find_node_by_path("/rtas");
        if (!root)
                root = of_find_node_by_path("/");
  
  
  #define VEC5_AFFINITY_BYTE    5
  #define VEC5_AFFINITY         0x80
-       chosen = of_find_node_by_path("/chosen");
-       if (chosen) {
-               vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL);
-               if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) {
-                       dbg("Using form 1 affinity\n");
-                       form1_affinity = 1;
+       if (firmware_has_feature(FW_FEATURE_OPAL))
+               form1_affinity = 1;
+       else {
+               chosen = of_find_node_by_path("/chosen");
+               if (chosen) {
+                       vec5 = of_get_property(chosen,
+                                              "ibm,architecture-vec-5", NULL);
+                       if (vec5 && (vec5[VEC5_AFFINITY_BYTE] &
+                                                       VEC5_AFFINITY)) {
+                               dbg("Using form 1 affinity\n");
+                               form1_affinity = 1;
+                       }
                }
        }
  
@@@ -689,8 -719,7 +699,7 @@@ static void __init parse_drconf_memory(
  
  static int __init parse_numa_properties(void)
  {
-       struct device_node *cpu = NULL;
-       struct device_node *memory = NULL;
+       struct device_node *memory;
        int default_nid = 0;
        unsigned long i;
  
         * each node to be onlined must have NODE_DATA etc backing it.
         */
        for_each_present_cpu(i) {
+               struct device_node *cpu;
                int nid;
  
                cpu = of_get_cpu_node(i, NULL);
        }
  
        get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
-       memory = NULL;
-       while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+       for_each_node_by_type(memory, "memory") {
                unsigned long start;
                unsigned long size;
                int nid;
@@@ -780,8 -810,9 +790,9 @@@ new_range
        }
  
        /*
-        * Now do the same thing for each MEMBLOCK listed in the ibm,dynamic-memory
-        * property in the ibm,dynamic-reconfiguration-memory node.
+        * Now do the same thing for each MEMBLOCK listed in the
+        * ibm,dynamic-memory property in the
+        * ibm,dynamic-reconfiguration-memory node.
         */
        memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
        if (memory)
@@@ -1167,10 -1198,10 +1178,10 @@@ static int hot_add_drconf_scn_to_nid(st
   */
  int hot_add_node_scn_to_nid(unsigned long scn_addr)
  {
-       struct device_node *memory = NULL;
+       struct device_node *memory;
        int nid = -1;
  
-       while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+       for_each_node_by_type(memory, "memory") {
                unsigned long start, size;
                int ranges;
                const unsigned int *memcell_buf;
                        break;
                }
  
-               of_node_put(memory);
                if (nid >= 0)
                        break;
        }
  
+       of_node_put(memory);
        return nid;
  }
  
diff --combined arch/sparc/mm/init_64.c
index 8415f614ce0c2f1284fe6eddda41d1fe6e750564,8e073d802139705aa9d6fc280c8e20f8d8f8e0f0..8584a25a9f0df742675fa8318746ee911a38aa91
@@@ -511,6 -511,11 +511,11 @@@ static void __init read_obp_translation
                for (i = 0; i < prom_trans_ents; i++)
                        prom_trans[i].data &= ~0x0003fe0000000000UL;
        }
+       /* Force execute bit on.  */
+       for (i = 0; i < prom_trans_ents; i++)
+               prom_trans[i].data |= (tlb_type == hypervisor ?
+                                      _PAGE_EXEC_4V : _PAGE_EXEC_4U);
  }
  
  static void __init hypervisor_tlb_lock(unsigned long vaddr,
@@@ -785,7 -790,7 +790,7 @@@ static int find_node(unsigned long addr
        return -1;
  }
  
 -u64 memblock_nid_range(u64 start, u64 end, int *nid)
 +static u64 memblock_nid_range(u64 start, u64 end, int *nid)
  {
        *nid = find_node(start);
        start += PAGE_SIZE;
        return start;
  }
  #else
 -u64 memblock_nid_range(u64 start, u64 end, int *nid)
 +static u64 memblock_nid_range(u64 start, u64 end, int *nid)
  {
        *nid = 0;
        return end;
@@@ -1597,6 -1602,44 +1602,44 @@@ static void __init tsb_phys_patch(void
  static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR];
  extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
  
+ static void patch_one_ktsb_phys(unsigned int *start, unsigned int *end, unsigned long pa)
+ {
+       pa >>= KTSB_PHYS_SHIFT;
+       while (start < end) {
+               unsigned int *ia = (unsigned int *)(unsigned long)*start;
+               ia[0] = (ia[0] & ~0x3fffff) | (pa >> 10);
+               __asm__ __volatile__("flush     %0" : : "r" (ia));
+               ia[1] = (ia[1] & ~0x3ff) | (pa & 0x3ff);
+               __asm__ __volatile__("flush     %0" : : "r" (ia + 1));
+               start++;
+       }
+ }
+ static void ktsb_phys_patch(void)
+ {
+       extern unsigned int __swapper_tsb_phys_patch;
+       extern unsigned int __swapper_tsb_phys_patch_end;
+       unsigned long ktsb_pa;
+       ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE);
+       patch_one_ktsb_phys(&__swapper_tsb_phys_patch,
+                           &__swapper_tsb_phys_patch_end, ktsb_pa);
+ #ifndef CONFIG_DEBUG_PAGEALLOC
+       {
+       extern unsigned int __swapper_4m_tsb_phys_patch;
+       extern unsigned int __swapper_4m_tsb_phys_patch_end;
+       ktsb_pa = (kern_base +
+                  ((unsigned long)&swapper_4m_tsb[0] - KERNBASE));
+       patch_one_ktsb_phys(&__swapper_4m_tsb_phys_patch,
+                           &__swapper_4m_tsb_phys_patch_end, ktsb_pa);
+       }
+ #endif
+ }
  static void __init sun4v_ktsb_init(void)
  {
        unsigned long ktsb_pa;
@@@ -1716,8 -1759,10 +1759,10 @@@ void __init paging_init(void
                sun4u_pgprot_init();
  
        if (tlb_type == cheetah_plus ||
-           tlb_type == hypervisor)
+           tlb_type == hypervisor) {
                tsb_phys_patch();
+               ktsb_phys_patch();
+       }
  
        if (tlb_type == hypervisor) {
                sun4v_patch_tlb_handlers();
diff --combined arch/x86/Kconfig
index 28116d4f7b64c8da5528e9b452c937e4c2ee2b37,cb9a1044a771be75563305f8909097a67cd21778..5d1514c263f84a26c591f4951ae3989c861d2eae
@@@ -20,13 -20,12 +20,14 @@@ config X8
        select HAVE_UNSTABLE_SCHED_CLOCK
        select HAVE_IDE
        select HAVE_OPROFILE
+       select HAVE_PCSPKR_PLATFORM
        select HAVE_PERF_EVENTS
        select HAVE_IRQ_WORK
        select HAVE_IOREMAP_PROT
        select HAVE_KPROBES
        select HAVE_MEMBLOCK
 +      select HAVE_MEMBLOCK_NODE_MAP
 +      select ARCH_DISCARD_MEMBLOCK
        select ARCH_WANT_OPTIONAL_GPIOLIB
        select ARCH_WANT_FRAME_POINTERS
        select HAVE_DMA_ATTRS
        select HAVE_TEXT_POKE_SMP
        select HAVE_GENERIC_HARDIRQS
        select HAVE_SPARSE_IRQ
+       select SPARSE_IRQ
        select GENERIC_FIND_FIRST_BIT
        select GENERIC_IRQ_PROBE
        select GENERIC_PENDING_IRQ if SMP
        select GENERIC_IRQ_SHOW
+       select GENERIC_CLOCKEVENTS_MIN_ADJUST
        select IRQ_FORCED_THREADING
        select USE_GENERIC_SMP_HELPERS if SMP
        select HAVE_BPF_JIT if (X86_64 && NET)
+       select CLKEVT_I8253
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG
  
  config INSTRUCTION_DECODER
        def_bool (KPROBES || PERF_EVENTS)
@@@ -95,6 -98,10 +100,10 @@@ config CLOCKSOURCE_WATCHDO
  config GENERIC_CLOCKEVENTS
        def_bool y
  
+ config ARCH_CLOCKSOURCE_DATA
+       def_bool y
+       depends on X86_64
  config GENERIC_CLOCKEVENTS_BROADCAST
        def_bool y
        depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
@@@ -125,7 -132,7 +134,7 @@@ config SBU
        bool
  
  config NEED_DMA_MAP_STATE
-        def_bool (X86_64 || DMAR || DMA_API_DEBUG)
+        def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG)
  
  config NEED_SG_DMA_LENGTH
        def_bool y
@@@ -187,9 -194,6 +196,6 @@@ config NEED_PER_CPU_EMBED_FIRST_CHUN
  config NEED_PER_CPU_PAGE_FIRST_CHUNK
        def_bool y
  
- config HAVE_CPUMASK_OF_CPU_MAP
-       def_bool X86_64_SMP
  config ARCH_HIBERNATION_POSSIBLE
        def_bool y
  
@@@ -215,7 -219,7 +221,7 @@@ config ARCH_SUPPORTS_DEBUG_PAGEALLO
  
  config HAVE_INTEL_TXT
        def_bool y
-       depends on EXPERIMENTAL && DMAR && ACPI
+       depends on EXPERIMENTAL && INTEL_IOMMU && ACPI
  
  config X86_32_SMP
        def_bool y
@@@ -274,7 -278,7 +280,7 @@@ config SM
          Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
          Management" code will be disabled if you say Y here.
  
-         See also <file:Documentation/i386/IO-APIC.txt>,
+         See also <file:Documentation/x86/i386/IO-APIC.txt>,
          <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at
          <http://www.tldp.org/docs.html#howto>.
  
  
  config X86_X2APIC
        bool "Support x2apic"
-       depends on X86_LOCAL_APIC && X86_64 && INTR_REMAP
+       depends on X86_LOCAL_APIC && X86_64 && IRQ_REMAP
        ---help---
          This enables x2apic support on CPUs that have this feature.
  
@@@ -386,12 -390,21 +392,21 @@@ config X86_INTEL_C
          This option compiles in support for the CE4100 SOC for settop
          boxes and media devices.
  
+ config X86_INTEL_MID
+       bool "Intel MID platform support"
+       depends on X86_32
+       depends on X86_EXTENDED_PLATFORM
+       ---help---
+         Select to build a kernel capable of supporting Intel MID platform
+         systems which do not have the PCI legacy interfaces (Moorestown,
+         Medfield). If you are building for a PC class system say N here.
+ if X86_INTEL_MID
  config X86_MRST
         bool "Moorestown MID platform"
        depends on PCI
        depends on PCI_GOANY
-       depends on X86_32
-       depends on X86_EXTENDED_PLATFORM
        depends on X86_IO_APIC
        select APB_TIMER
        select I2C
          nor standard legacy replacement devices/features. e.g. Moorestown does
          not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
  
+ endif
  config X86_RDC321X
        bool "RDC R-321x SoC"
        depends on X86_32
@@@ -514,6 -529,18 +531,18 @@@ menuconfig PARAVIRT_GUES
  
  if PARAVIRT_GUEST
  
+ config PARAVIRT_TIME_ACCOUNTING
+       bool "Paravirtual steal time accounting"
+       select PARAVIRT
+       default n
+       ---help---
+         Select this option to enable fine granularity task steal time
+         accounting. Time spent executing other tasks in parallel with
+         the current vCPU is discounted from the vCPU power. To account for
+         that, there can be a small performance impact.
+         If in doubt, say N here.
  source "arch/x86/xen/Kconfig"
  
  config KVM_CLOCK
@@@ -619,6 -646,7 +648,7 @@@ config HPET_EMULATE_RT
  config APB_TIMER
         def_bool y if MRST
         prompt "Langwell APB Timer Support" if X86_MRST
+        select DW_APB_TIMER
         help
           APB timer is the replacement for 8254, HPET on X86 MID platforms.
           The APBT provides a stable time base on SMP
@@@ -682,33 -710,6 +712,6 @@@ config CALGARY_IOMMU_ENABLED_BY_DEFAUL
          Calgary anyway, pass 'iommu=calgary' on the kernel command line.
          If unsure, say Y.
  
- config AMD_IOMMU
-       bool "AMD IOMMU support"
-       select SWIOTLB
-       select PCI_MSI
-       select PCI_IOV
-       depends on X86_64 && PCI && ACPI
-       ---help---
-         With this option you can enable support for AMD IOMMU hardware in
-         your system. An IOMMU is a hardware component which provides
-         remapping of DMA memory accesses from devices. With an AMD IOMMU you
-         can isolate the the DMA memory of different devices and protect the
-         system from misbehaving device drivers or hardware.
-         You can find out if your system has an AMD IOMMU if you look into
-         your BIOS for an option to enable it or if you have an IVRS ACPI
-         table.
- config AMD_IOMMU_STATS
-       bool "Export AMD IOMMU statistics to debugfs"
-       depends on AMD_IOMMU
-       select DEBUG_FS
-       ---help---
-         This option enables code in the AMD IOMMU driver to collect various
-         statistics about whats happening in the driver and exports that
-         information to userspace via debugfs.
-         If unsure, say N.
  # need this always selected by IOMMU for the VIA workaround
  config SWIOTLB
        def_bool y if X86_64
  config IOMMU_HELPER
        def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
  
- config IOMMU_API
-       def_bool (AMD_IOMMU || DMAR)
  config MAXSMP
        bool "Enable Maximum number of SMP Processors and NUMA Nodes"
        depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
@@@ -1172,7 -1170,7 +1172,7 @@@ comment "NUMA (Summit) requires SMP, 64
  config AMD_NUMA
        def_bool y
        prompt "Old style AMD Opteron NUMA detection"
-       depends on NUMA && PCI
+       depends on X86_64 && NUMA && PCI
        ---help---
          Enable AMD NUMA node topology detection.  You should say Y here if
          you have a multi processor AMD system. This uses an old method to
@@@ -1453,6 -1451,15 +1453,15 @@@ config ARCH_USES_PG_UNCACHE
        def_bool y
        depends on X86_PAT
  
+ config ARCH_RANDOM
+       def_bool y
+       prompt "x86 architectural random number generator" if EXPERT
+       ---help---
+         Enable the x86 architectural RDRAND instruction
+         (Intel Bull Mountain technology) to generate random numbers.
+         If supported, this is a high bandwidth, cryptographically
+         secure hardware random number generator.
  config EFI
        bool "EFI runtime service support"
        depends on ACPI
@@@ -1739,8 -1746,8 +1748,8 @@@ menuconfig AP
          machines with more than one CPU.
  
          In order to use APM, you will need supporting software. For location
-         and more information, read <file:Documentation/power/pm.txt> and the
-         Battery Powered Linux mini-HOWTO, available from
+         and more information, read <file:Documentation/power/apm-acpi.txt>
+         and the Battery Powered Linux mini-HOWTO, available from
          <http://www.tldp.org/docs.html#howto>.
  
          This driver does not spin down disk drives (see the hdparm(8)
@@@ -1907,7 -1914,7 +1916,7 @@@ config PCI_BIO
  # x86-64 doesn't support PCI BIOS access from long mode so always go direct.
  config PCI_DIRECT
        def_bool y
-       depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC))
+       depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC || PCI_GOMMCONFIG))
  
  config PCI_MMCONFIG
        def_bool y
@@@ -1944,55 -1951,6 +1953,6 @@@ config PCI_CNB20LE_QUIR
  
          You should say N unless you know you need this.
  
- config DMAR
-       bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
-       depends on PCI_MSI && ACPI && EXPERIMENTAL
-       help
-         DMA remapping (DMAR) devices support enables independent address
-         translations for Direct Memory Access (DMA) from devices.
-         These DMA remapping devices are reported via ACPI tables
-         and include PCI device scope covered by these DMA
-         remapping devices.
- config DMAR_DEFAULT_ON
-       def_bool y
-       prompt "Enable DMA Remapping Devices by default"
-       depends on DMAR
-       help
-         Selecting this option will enable a DMAR device at boot time if
-         one is found. If this option is not selected, DMAR support can
-         be enabled by passing intel_iommu=on to the kernel. It is
-         recommended you say N here while the DMAR code remains
-         experimental.
- config DMAR_BROKEN_GFX_WA
-       bool "Workaround broken graphics drivers (going away soon)"
-       depends on DMAR && BROKEN
-       ---help---
-         Current Graphics drivers tend to use physical address
-         for DMA and avoid using DMA APIs. Setting this config
-         option permits the IOMMU driver to set a unity map for
-         all the OS-visible memory. Hence the driver can continue
-         to use physical addresses for DMA, at least until this
-         option is removed in the 2.6.32 kernel.
- config DMAR_FLOPPY_WA
-       def_bool y
-       depends on DMAR
-       ---help---
-         Floppy disk drivers are known to bypass DMA API calls
-         thereby failing to work when IOMMU is enabled. This
-         workaround will setup a 1:1 mapping for the first
-         16MiB to make floppy (an ISA device) work.
- config INTR_REMAP
-       bool "Support for Interrupt Remapping (EXPERIMENTAL)"
-       depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
-       ---help---
-         Supports Interrupt remapping for IO-APIC and MSI devices.
-         To use x2apic mode in the CPU's which support x2APIC enhancements or
-         to support platforms with CPU's having > 8 bit APIC ID, say Y.
  source "drivers/pci/pcie/Kconfig"
  
  source "drivers/pci/Kconfig"
@@@ -2075,11 -2033,58 +2035,58 @@@ config OLP
          Add support for detecting the unique features of the OLPC
          XO hardware.
  
- config OLPC_XO1
-       tristate "OLPC XO-1 support"
-       depends on OLPC && MFD_CS5535
+ config OLPC_XO1_PM
+       bool "OLPC XO-1 Power Management"
+       depends on OLPC && MFD_CS5535 && PM_SLEEP
+       select MFD_CORE
+       ---help---
+         Add support for poweroff and suspend of the OLPC XO-1 laptop.
+ config OLPC_XO1_RTC
+       bool "OLPC XO-1 Real Time Clock"
+       depends on OLPC_XO1_PM && RTC_DRV_CMOS
+       ---help---
+         Add support for the XO-1 real time clock, which can be used as a
+         programmable wakeup source.
+ config OLPC_XO1_SCI
+       bool "OLPC XO-1 SCI extras"
+       depends on OLPC && OLPC_XO1_PM
+       select POWER_SUPPLY
+       select GPIO_CS5535
+       select MFD_CORE
+       ---help---
+         Add support for SCI-based features of the OLPC XO-1 laptop:
+          - EC-driven system wakeups
+          - Power button
+          - Ebook switch
+          - Lid switch
+          - AC adapter status updates
+          - Battery status updates
+ config OLPC_XO15_SCI
+       bool "OLPC XO-1.5 SCI extras"
+       depends on OLPC && ACPI
+       select POWER_SUPPLY
+       ---help---
+         Add support for SCI-based features of the OLPC XO-1.5 laptop:
+          - EC-driven system wakeups
+          - AC adapter status updates
+          - Battery status updates
+ config ALIX
+       bool "PCEngines ALIX System Support (LED setup)"
+       select GPIOLIB
        ---help---
-         Add support for non-essential features of the OLPC XO-1 laptop.
+         This option enables system support for the PCEngines ALIX.
+         At present this just sets up LEDs for GPIO control on
+         ALIX2/3/6 boards.  However, other system specific setup should
+         get added here.
+         Note: You must still enable the drivers for GPIO and LED support
+         (GPIO_CS5535 & LEDS_GPIO) to actually use the LEDs
+         Note: You have to set alix.force=1 for boards with Award BIOS.
  
  endif # X86_32
  
diff --combined arch/x86/kernel/e820.c
index 84475f1e220127bcb3ae1414634ed4395c815496,303a0e48f076feb3feb522d4052ac4b958995d42..056e65d5012bc9edbbdc57fb3ced81b5148d79e2
@@@ -12,6 -12,7 +12,7 @@@
  #include <linux/types.h>
  #include <linux/init.h>
  #include <linux/crash_dump.h>
+ #include <linux/export.h>
  #include <linux/bootmem.h>
  #include <linux/pfn.h>
  #include <linux/suspend.h>
@@@ -737,17 -738,35 +738,17 @@@ core_initcall(e820_mark_nvs_memory)
  /*
   * pre allocated 4k and reserved it in memblock and e820_saved
   */
 -u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
 +u64 __init early_reserve_e820(u64 size, u64 align)
  {
 -      u64 size = 0;
        u64 addr;
 -      u64 start;
  
 -      for (start = startt; ; start += size) {
 -              start = memblock_x86_find_in_range_size(start, &size, align);
 -              if (start == MEMBLOCK_ERROR)
 -                      return 0;
 -              if (size >= sizet)
 -                      break;
 +      addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 +      if (addr) {
 +              e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
 +              printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
 +              update_e820_saved();
        }
  
 -#ifdef CONFIG_X86_32
 -      if (start >= MAXMEM)
 -              return 0;
 -      if (start + size > MAXMEM)
 -              size = MAXMEM - start;
 -#endif
 -
 -      addr = round_down(start + size - sizet, align);
 -      if (addr < start)
 -              return 0;
 -      memblock_x86_reserve_range(addr, addr + sizet, "new next");
 -      e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
 -      printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
 -      update_e820_saved();
 -
        return addr;
  }
  
@@@ -1093,30 -1112,15 +1094,30 @@@ void __init memblock_x86_fill(void
  void __init memblock_find_dma_reserve(void)
  {
  #ifdef CONFIG_X86_64
 -      u64 free_size_pfn;
 -      u64 mem_size_pfn;
 +      u64 nr_pages = 0, nr_free_pages = 0;
 +      unsigned long start_pfn, end_pfn;
 +      phys_addr_t start, end;
 +      int i;
 +      u64 u;
 +
        /*
         * need to find out used area below MAX_DMA_PFN
         * need to use memblock to get free size in [0, MAX_DMA_PFN]
         * at first, and assume boot_mem will not take below MAX_DMA_PFN
         */
 -      mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
 -      free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
 -      set_dma_reserve(mem_size_pfn - free_size_pfn);
 +      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
 +              start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN);
 +              end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN);
 +              nr_pages += end_pfn - start_pfn;
 +      }
 +
 +      for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
 +              start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
 +              end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
 +              if (start_pfn < end_pfn)
 +                      nr_free_pages += end_pfn - start_pfn;
 +      }
 +
 +      set_dma_reserve(nr_pages - nr_free_pages);
  #endif
  }
diff --combined arch/x86/kernel/setup.c
index 97d227ec995d100f45f62c1a856340e68e3b6375,cf0ef986cb6dff51348c17c691491f6f48c61a60..d05444ac2aea59378d1aac95e53c44f0a7a19a4e
@@@ -306,8 -306,7 +306,8 @@@ static void __init cleanup_highmap(void
  static void __init reserve_brk(void)
  {
        if (_brk_end > _brk_start)
 -              memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
 +              memblock_reserve(__pa(_brk_start),
 +                               __pa(_brk_end) - __pa(_brk_start));
  
        /* Mark brk area as locked down and no longer taking any
           new allocations */
@@@ -332,13 -331,13 +332,13 @@@ static void __init relocate_initrd(void
        ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
                                         PAGE_SIZE);
  
 -      if (ramdisk_here == MEMBLOCK_ERROR)
 +      if (!ramdisk_here)
                panic("Cannot find place for new RAMDISK of size %lld\n",
                         ramdisk_size);
  
        /* Note: this includes all the lowmem currently occupied by
           the initrd, we rely on that fact to keep the data intact. */
 -      memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
 +      memblock_reserve(ramdisk_here, area_size);
        initrd_start = ramdisk_here + PAGE_OFFSET;
        initrd_end   = initrd_start + ramdisk_size;
        printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@@ -394,7 -393,7 +394,7 @@@ static void __init reserve_initrd(void
        initrd_start = 0;
  
        if (ramdisk_size >= (end_of_lowmem>>1)) {
 -              memblock_x86_free_range(ramdisk_image, ramdisk_end);
 +              memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
                printk(KERN_ERR "initrd too large to handle, "
                       "disabling initrd\n");
                return;
  
        relocate_initrd();
  
 -      memblock_x86_free_range(ramdisk_image, ramdisk_end);
 +      memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
  }
  #else
  static void __init reserve_initrd(void)
@@@ -491,13 -490,15 +491,13 @@@ static void __init memblock_x86_reserve
  {
        struct setup_data *data;
        u64 pa_data;
 -      char buf[32];
  
        if (boot_params.hdr.version < 0x0209)
                return;
        pa_data = boot_params.hdr.setup_data;
        while (pa_data) {
                data = early_memremap(pa_data, sizeof(*data));
 -              sprintf(buf, "setup data %x", data->type);
 -              memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
 +              memblock_reserve(pa_data, sizeof(*data) + data->len);
                pa_data = data->next;
                early_iounmap(data, sizeof(*data));
        }
@@@ -553,7 -554,7 +553,7 @@@ static void __init reserve_crashkernel(
                crash_base = memblock_find_in_range(alignment,
                               CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
  
 -              if (crash_base == MEMBLOCK_ERROR) {
 +              if (!crash_base) {
                        pr_info("crashkernel reservation failed - No suitable area found.\n");
                        return;
                }
                        return;
                }
        }
 -      memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
 +      memblock_reserve(crash_base, crash_size);
  
        printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
                        "for crashkernel (System RAM: %ldMB)\n",
@@@ -625,7 -626,7 +625,7 @@@ static __init void reserve_ibft_region(
        addr = find_ibft_region(&size);
  
        if (size)
 -              memblock_x86_reserve_range(addr, addr + size, "* ibft");
 +              memblock_reserve(addr, size);
  }
  
  static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
@@@ -1044,6 -1045,8 +1044,8 @@@ void __init setup_arch(char **cmdline_p
  
        x86_init.timers.wallclock_init();
  
+       x86_platform.wallclock_init();
        mcheck_init();
  
        arch_init_ideal_nops();
diff --combined arch/x86/mm/init.c
index 0b736b99d92555537d6656384f6704864983483b,87488b93a65ce19695947bc6ca7f58f948e0f24b..a298914058f9c98ebeaba6a84725f7310a1e6f5d
@@@ -63,12 -63,11 +63,11 @@@ static void __init find_early_table_spa
  #ifdef CONFIG_X86_32
        /* for fixmap */
        tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-       good_end = max_pfn_mapped << PAGE_SHIFT;
  #endif
+       good_end = max_pfn_mapped << PAGE_SHIFT;
  
        base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
 -      if (base == MEMBLOCK_ERROR)
 +      if (!base)
                panic("Cannot find space for the kernel page tables");
  
        pgt_buf_start = base >> PAGE_SHIFT;
@@@ -81,7 -80,7 +80,7 @@@
  
  void __init native_pagetable_reserve(u64 start, u64 end)
  {
 -      memblock_x86_reserve_range(start, end, "PGTABLE");
 +      memblock_reserve(start, end - start);
  }
  
  struct map_range {
@@@ -280,8 -279,8 +279,8 @@@ unsigned long __init_refok init_memory_
         * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
         * so that they can be reused for other purposes.
         *
 -       * On native it just means calling memblock_x86_reserve_range, on Xen it
 -       * also means marking RW the pagetable pages that we allocated before
 +       * On native it just means calling memblock_reserve, on Xen it also
 +       * means marking RW the pagetable pages that we allocated before
         * but that haven't been used.
         *
         * In fact on xen we mark RO the whole range pgt_buf_start -
diff --combined arch/x86/mm/init_64.c
index 7fb064cbdcec18e6c61ad3ea92593c858107fac5,bbaaa005bf0e865a9c3fc84a7ff1ed9854b80888..a8a56ce3a962ad7c1c46285830dd8cb994695231
@@@ -28,6 -28,7 +28,7 @@@
  #include <linux/poison.h>
  #include <linux/dma-mapping.h>
  #include <linux/module.h>
+ #include <linux/memory.h>
  #include <linux/memory_hotplug.h>
  #include <linux/nmi.h>
  #include <linux/gfp.h>
@@@ -607,7 -608,7 +608,7 @@@ kernel_physical_mapping_init(unsigned l
  #ifndef CONFIG_NUMA
  void __init initmem_init(void)
  {
 -      memblock_x86_register_active_regions(0, 0, max_pfn);
 +      memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
  }
  #endif
  
@@@ -895,8 -896,6 +896,6 @@@ const char *arch_vma_name(struct vm_are
  }
  
  #ifdef CONFIG_X86_UV
- #define MIN_MEMORY_BLOCK_SIZE   (1 << SECTION_SIZE_BITS)
  unsigned long memory_block_size_bytes(void)
  {
        if (is_uv_system()) {
diff --combined arch/x86/platform/efi/efi.c
index 3b4e86bda3cb9fc667dbf892af0f50e5bfd28c0c,37718f0f053d53346566c80958885f159b61dcde..4a01967f02e76c5d0156e35b94ef303162d661e6
@@@ -29,6 -29,7 +29,7 @@@
  #include <linux/kernel.h>
  #include <linux/init.h>
  #include <linux/efi.h>
+ #include <linux/export.h>
  #include <linux/bootmem.h>
  #include <linux/memblock.h>
  #include <linux/spinlock.h>
  int efi_enabled;
  EXPORT_SYMBOL(efi_enabled);
  
- struct efi efi;
+ struct efi __read_mostly efi = {
+       .mps        = EFI_INVALID_TABLE_ADDR,
+       .acpi       = EFI_INVALID_TABLE_ADDR,
+       .acpi20     = EFI_INVALID_TABLE_ADDR,
+       .smbios     = EFI_INVALID_TABLE_ADDR,
+       .sal_systab = EFI_INVALID_TABLE_ADDR,
+       .boot_info  = EFI_INVALID_TABLE_ADDR,
+       .hcdp       = EFI_INVALID_TABLE_ADDR,
+       .uga        = EFI_INVALID_TABLE_ADDR,
+       .uv_systab  = EFI_INVALID_TABLE_ADDR,
+ };
  EXPORT_SYMBOL(efi);
  
  struct efi_memory_map memmap;
@@@ -79,26 -90,50 +90,50 @@@ early_param("add_efi_memmap", setup_add
  
  static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
  {
-       return efi_call_virt2(get_time, tm, tc);
+       unsigned long flags;
+       efi_status_t status;
+       spin_lock_irqsave(&rtc_lock, flags);
+       status = efi_call_virt2(get_time, tm, tc);
+       spin_unlock_irqrestore(&rtc_lock, flags);
+       return status;
  }
  
  static efi_status_t virt_efi_set_time(efi_time_t *tm)
  {
-       return efi_call_virt1(set_time, tm);
+       unsigned long flags;
+       efi_status_t status;
+       spin_lock_irqsave(&rtc_lock, flags);
+       status = efi_call_virt1(set_time, tm);
+       spin_unlock_irqrestore(&rtc_lock, flags);
+       return status;
  }
  
  static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
                                             efi_bool_t *pending,
                                             efi_time_t *tm)
  {
-       return efi_call_virt3(get_wakeup_time,
-                             enabled, pending, tm);
+       unsigned long flags;
+       efi_status_t status;
+       spin_lock_irqsave(&rtc_lock, flags);
+       status = efi_call_virt3(get_wakeup_time,
+                               enabled, pending, tm);
+       spin_unlock_irqrestore(&rtc_lock, flags);
+       return status;
  }
  
  static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
  {
-       return efi_call_virt2(set_wakeup_time,
-                             enabled, tm);
+       unsigned long flags;
+       efi_status_t status;
+       spin_lock_irqsave(&rtc_lock, flags);
+       status = efi_call_virt2(set_wakeup_time,
+                               enabled, tm);
+       spin_unlock_irqrestore(&rtc_lock, flags);
+       return status;
  }
  
  static efi_status_t virt_efi_get_variable(efi_char16_t *name,
@@@ -122,7 -157,7 +157,7 @@@ static efi_status_t virt_efi_get_next_v
  
  static efi_status_t virt_efi_set_variable(efi_char16_t *name,
                                          efi_guid_t *vendor,
-                                         unsigned long attr,
+                                         u32 attr,
                                          unsigned long data_size,
                                          void *data)
  {
                              data_size, data);
  }
  
+ static efi_status_t virt_efi_query_variable_info(u32 attr,
+                                                u64 *storage_space,
+                                                u64 *remaining_space,
+                                                u64 *max_variable_size)
+ {
+       if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+               return EFI_UNSUPPORTED;
+       return efi_call_virt4(query_variable_info, attr, storage_space,
+                             remaining_space, max_variable_size);
+ }
  static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
  {
        return efi_call_virt1(get_next_high_mono_count, count);
@@@ -145,6 -192,28 +192,28 @@@ static void virt_efi_reset_system(int r
                       data_size, data);
  }
  
+ static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules,
+                                           unsigned long count,
+                                           unsigned long sg_list)
+ {
+       if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+               return EFI_UNSUPPORTED;
+       return efi_call_virt3(update_capsule, capsules, count, sg_list);
+ }
+ static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules,
+                                               unsigned long count,
+                                               u64 *max_size,
+                                               int *reset_type)
+ {
+       if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+               return EFI_UNSUPPORTED;
+       return efi_call_virt4(query_capsule_caps, capsules, count, max_size,
+                             reset_type);
+ }
  static efi_status_t __init phys_efi_set_virtual_address_map(
        unsigned long memory_map_size,
        unsigned long descriptor_size,
  static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
                                             efi_time_cap_t *tc)
  {
+       unsigned long flags;
        efi_status_t status;
  
+       spin_lock_irqsave(&rtc_lock, flags);
        efi_call_phys_prelog();
        status = efi_call_phys2(efi_phys.get_time, tm, tc);
        efi_call_phys_epilog();
+       spin_unlock_irqrestore(&rtc_lock, flags);
        return status;
  }
  
@@@ -280,7 -352,8 +352,7 @@@ void __init efi_memblock_x86_reserve_ra
                boot_params.efi_info.efi_memdesc_size;
        memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
        memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
 -      memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size,
 -                    "EFI memmap");
 +      memblock_reserve(pmap, memmap.nr_map * memmap.desc_size);
  }
  
  #if EFI_DEBUG
@@@ -324,14 -397,16 +396,14 @@@ void __init efi_reserve_boot_services(v
                if ((start+size >= virt_to_phys(_text)
                                && start <= virt_to_phys(_end)) ||
                        !e820_all_mapped(start, start+size, E820_RAM) ||
 -                      memblock_x86_check_reserved_size(&start, &size,
 -                                                      1<<EFI_PAGE_SHIFT)) {
 +                      memblock_is_region_reserved(start, size)) {
                        /* Could not reserve, skip it */
                        md->num_pages = 0;
                        memblock_dbg(PFX "Could not reserve boot range "
                                        "[0x%010llx-0x%010llx]\n",
                                                start, start+size-1);
                } else
 -                      memblock_x86_reserve_range(start, start+size,
 -                                                      "EFI Boot");
 +                      memblock_reserve(start, size);
        }
  }
  
@@@ -666,6 -741,9 +738,9 @@@ void __init efi_enter_virtual_mode(void
        efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
        efi.reset_system = virt_efi_reset_system;
        efi.set_virtual_address_map = NULL;
+       efi.query_variable_info = virt_efi_query_variable_info;
+       efi.update_capsule = virt_efi_update_capsule;
+       efi.query_capsule_caps = virt_efi_query_capsule_caps;
        if (__supported_pte_mask & _PAGE_NX)
                runtime_code_page_mkexec();
        early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
diff --combined arch/x86/xen/mmu.c
index ad54fa10f8a235e3d8595e8cdb7ea95a42eea1a2,87f6673b1207d6e1f5fc2861a0b1797489b0e862..f4bf8aa574f432aed85bea73fed2c0620c040e5b
@@@ -48,6 -48,8 +48,8 @@@
  #include <linux/memblock.h>
  #include <linux/seq_file.h>
  
+ #include <trace/events/xen.h>
  #include <asm/pgtable.h>
  #include <asm/tlbflush.h>
  #include <asm/fixmap.h>
@@@ -194,6 -196,8 +196,8 @@@ void xen_set_domain_pte(pte_t *ptep, pt
        struct multicall_space mcs;
        struct mmu_update *u;
  
+       trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
        mcs = xen_mc_entry(sizeof(*u));
        u = mcs.args;
  
@@@ -225,6 -229,24 +229,24 @@@ static void xen_extend_mmu_update(cons
        *u = *update;
  }
  
+ static void xen_extend_mmuext_op(const struct mmuext_op *op)
+ {
+       struct multicall_space mcs;
+       struct mmuext_op *u;
+       mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
+       if (mcs.mc != NULL) {
+               mcs.mc->args[1]++;
+       } else {
+               mcs = __xen_mc_entry(sizeof(*u));
+               MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+       }
+       u = mcs.args;
+       *u = *op;
+ }
  static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
  {
        struct mmu_update u;
  
  static void xen_set_pmd(pmd_t *ptr, pmd_t val)
  {
+       trace_xen_mmu_set_pmd(ptr, val);
        /* If page is not pinned, we can just update the entry
           directly */
        if (!xen_page_pinned(ptr)) {
@@@ -282,22 -306,30 +306,30 @@@ static bool xen_batched_set_pte(pte_t *
        return true;
  }
  
- static void xen_set_pte(pte_t *ptep, pte_t pteval)
+ static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
  {
        if (!xen_batched_set_pte(ptep, pteval))
                native_set_pte(ptep, pteval);
  }
  
+ static void xen_set_pte(pte_t *ptep, pte_t pteval)
+ {
+       trace_xen_mmu_set_pte(ptep, pteval);
+       __xen_set_pte(ptep, pteval);
+ }
  static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, pte_t pteval)
  {
-       xen_set_pte(ptep, pteval);
+       trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
+       __xen_set_pte(ptep, pteval);
  }
  
  pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
                                 unsigned long addr, pte_t *ptep)
  {
        /* Just return the pte as-is.  We preserve the bits on commit */
+       trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
        return *ptep;
  }
  
@@@ -306,6 -338,7 +338,7 @@@ void xen_ptep_modify_prot_commit(struc
  {
        struct mmu_update u;
  
+       trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
        xen_mc_batch();
  
        u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
@@@ -462,41 -495,6 +495,6 @@@ static pte_t xen_make_pte(pteval_t pte
  }
  PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
  
- #ifdef CONFIG_XEN_DEBUG
- pte_t xen_make_pte_debug(pteval_t pte)
- {
-       phys_addr_t addr = (pte & PTE_PFN_MASK);
-       phys_addr_t other_addr;
-       bool io_page = false;
-       pte_t _pte;
-       if (pte & _PAGE_IOMAP)
-               io_page = true;
-       _pte = xen_make_pte(pte);
-       if (!addr)
-               return _pte;
-       if (io_page &&
-           (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
-               other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
-               WARN_ONCE(addr != other_addr,
-                       "0x%lx is using VM_IO, but it is 0x%lx!\n",
-                       (unsigned long)addr, (unsigned long)other_addr);
-       } else {
-               pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
-               other_addr = (_pte.pte & PTE_PFN_MASK);
-               WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
-                       "0x%lx is missing VM_IO (and wasn't fixed)!\n",
-                       (unsigned long)addr);
-       }
-       return _pte;
- }
- PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
- #endif
  static pgd_t xen_make_pgd(pgdval_t pgd)
  {
        pgd = pte_pfn_to_mfn(pgd);
@@@ -530,6 -528,8 +528,8 @@@ static void xen_set_pud_hyper(pud_t *pt
  
  static void xen_set_pud(pud_t *ptr, pud_t val)
  {
+       trace_xen_mmu_set_pud(ptr, val);
        /* If page is not pinned, we can just update the entry
           directly */
        if (!xen_page_pinned(ptr)) {
  #ifdef CONFIG_X86_PAE
  static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
  {
+       trace_xen_mmu_set_pte_atomic(ptep, pte);
        set_64bit((u64 *)ptep, native_pte_val(pte));
  }
  
  static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
  {
+       trace_xen_mmu_pte_clear(mm, addr, ptep);
        if (!xen_batched_set_pte(ptep, native_make_pte(0)))
                native_pte_clear(mm, addr, ptep);
  }
  
  static void xen_pmd_clear(pmd_t *pmdp)
  {
+       trace_xen_mmu_pmd_clear(pmdp);
        set_pmd(pmdp, __pmd(0));
  }
  #endif        /* CONFIG_X86_PAE */
@@@ -629,6 -632,8 +632,8 @@@ static void xen_set_pgd(pgd_t *ptr, pgd
  {
        pgd_t *user_ptr = xen_get_user_pgd(ptr);
  
+       trace_xen_mmu_set_pgd(ptr, user_ptr, val);
        /* If page is not pinned, we can just update the entry
           directly */
        if (!xen_page_pinned(ptr)) {
@@@ -788,14 -793,12 +793,12 @@@ static void xen_pte_unlock(void *v
  
  static void xen_do_pin(unsigned level, unsigned long pfn)
  {
-       struct mmuext_op *op;
-       struct multicall_space mcs;
+       struct mmuext_op op;
  
-       mcs = __xen_mc_entry(sizeof(*op));
-       op = mcs.args;
-       op->cmd = level;
-       op->arg1.mfn = pfn_to_mfn(pfn);
-       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+       op.cmd = level;
+       op.arg1.mfn = pfn_to_mfn(pfn);
+       xen_extend_mmuext_op(&op);
  }
  
  static int xen_pin_page(struct mm_struct *mm, struct page *page,
     read-only, and can be pinned. */
  static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
  {
+       trace_xen_mmu_pgd_pin(mm, pgd);
        xen_mc_batch();
  
        if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
@@@ -988,6 -993,8 +993,8 @@@ static int xen_unpin_page(struct mm_str
  /* Release a pagetables pages back as normal RW */
  static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
  {
+       trace_xen_mmu_pgd_unpin(mm, pgd);
        xen_mc_batch();
  
        xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
@@@ -1196,6 -1203,8 +1203,8 @@@ static void xen_flush_tlb(void
        struct mmuext_op *op;
        struct multicall_space mcs;
  
+       trace_xen_mmu_flush_tlb(0);
        preempt_disable();
  
        mcs = xen_mc_entry(sizeof(*op));
@@@ -1214,6 -1223,8 +1223,8 @@@ static void xen_flush_tlb_single(unsign
        struct mmuext_op *op;
        struct multicall_space mcs;
  
+       trace_xen_mmu_flush_tlb_single(addr);
        preempt_disable();
  
        mcs = xen_mc_entry(sizeof(*op));
@@@ -1240,6 -1251,8 +1251,8 @@@ static void xen_flush_tlb_others(const 
        } *args;
        struct multicall_space mcs;
  
+       trace_xen_mmu_flush_tlb_others(cpus, mm, va);
        if (cpumask_empty(cpus))
                return;         /* nothing to do */
  
@@@ -1275,10 -1288,11 +1288,11 @@@ static void set_current_cr3(void *v
  
  static void __xen_write_cr3(bool kernel, unsigned long cr3)
  {
-       struct mmuext_op *op;
-       struct multicall_space mcs;
+       struct mmuext_op op;
        unsigned long mfn;
  
+       trace_xen_mmu_write_cr3(kernel, cr3);
        if (cr3)
                mfn = pfn_to_mfn(PFN_DOWN(cr3));
        else
  
        WARN_ON(mfn == 0 && kernel);
  
-       mcs = __xen_mc_entry(sizeof(*op));
-       op = mcs.args;
-       op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
-       op->arg1.mfn = mfn;
+       op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+       op.arg1.mfn = mfn;
  
-       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+       xen_extend_mmuext_op(&op);
  
        if (kernel) {
                percpu_write(xen_cr3, cr3);
@@@ -1451,19 -1462,52 +1462,52 @@@ static void __init xen_release_pmd_init
        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
  }
  
+ static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+ {
+       struct multicall_space mcs;
+       struct mmuext_op *op;
+       mcs = __xen_mc_entry(sizeof(*op));
+       op = mcs.args;
+       op->cmd = cmd;
+       op->arg1.mfn = pfn_to_mfn(pfn);
+       MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+ }
+ static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
+ {
+       struct multicall_space mcs;
+       unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
+       mcs = __xen_mc_entry(0);
+       MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
+                               pfn_pte(pfn, prot), 0);
+ }
  /* This needs to make sure the new pte page is pinned iff its being
     attached to a pinned pagetable. */
- static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
+ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
+                                   unsigned level)
  {
-       struct page *page = pfn_to_page(pfn);
+       bool pinned = PagePinned(virt_to_page(mm->pgd));
+       trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
+       if (pinned) {
+               struct page *page = pfn_to_page(pfn);
  
-       if (PagePinned(virt_to_page(mm->pgd))) {
                SetPagePinned(page);
  
                if (!PageHighMem(page)) {
-                       make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+                       xen_mc_batch();
+                       __set_pfn_prot(pfn, PAGE_KERNEL_RO);
                        if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-                               pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+                               __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+                       xen_mc_issue(PARAVIRT_LAZY_MMU);
                } else {
                        /* make sure there are no stray mappings of
                           this page */
@@@ -1483,15 -1527,23 +1527,23 @@@ static void xen_alloc_pmd(struct mm_str
  }
  
  /* This should never happen until we're OK to use struct page */
- static void xen_release_ptpage(unsigned long pfn, unsigned level)
+ static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
  {
        struct page *page = pfn_to_page(pfn);
+       bool pinned = PagePinned(page);
+       trace_xen_mmu_release_ptpage(pfn, level, pinned);
  
-       if (PagePinned(page)) {
+       if (pinned) {
                if (!PageHighMem(page)) {
+                       xen_mc_batch();
                        if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-                               pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
-                       make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+                               __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+                       __set_pfn_prot(pfn, PAGE_KERNEL);
+                       xen_mc_issue(PARAVIRT_LAZY_MMU);
                }
                ClearPagePinned(page);
        }
@@@ -1626,15 -1678,17 +1678,17 @@@ static void __init xen_map_identity_ear
  void __init xen_setup_machphys_mapping(void)
  {
        struct xen_machphys_mapping mapping;
-       unsigned long machine_to_phys_nr_ents;
  
        if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
                machine_to_phys_mapping = (unsigned long *)mapping.v_start;
-               machine_to_phys_nr_ents = mapping.max_mfn + 1;
+               machine_to_phys_nr = mapping.max_mfn + 1;
        } else {
-               machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
+               machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
        }
-       machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
+ #ifdef CONFIG_X86_32
+       WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
+               < machine_to_phys_mapping);
+ #endif
  }
  
  #ifdef CONFIG_X86_64
@@@ -1720,8 -1774,10 +1774,8 @@@ pgd_t * __init xen_setup_kernel_pagetab
        __xen_write_cr3(true, __pa(pgd));
        xen_mc_issue(PARAVIRT_LAZY_CPU);
  
 -      memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
 -                    __pa(xen_start_info->pt_base +
 -                         xen_start_info->nr_pt_frames * PAGE_SIZE),
 -                    "XEN PAGETABLES");
 +      memblock_reserve(__pa(xen_start_info->pt_base),
 +                       xen_start_info->nr_pt_frames * PAGE_SIZE);
  
        return pgd;
  }
@@@ -1797,8 -1853,10 +1851,8 @@@ pgd_t * __init xen_setup_kernel_pagetab
                          PFN_DOWN(__pa(initial_page_table)));
        xen_write_cr3(__pa(initial_page_table));
  
 -      memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
 -                    __pa(xen_start_info->pt_base +
 -                         xen_start_info->nr_pt_frames * PAGE_SIZE),
 -                    "XEN PAGETABLES");
 +      memblock_reserve(__pa(xen_start_info->pt_base),
 +                       xen_start_info->nr_pt_frames * PAGE_SIZE);
  
        return initial_page_table;
  }
@@@ -1825,6 -1883,7 +1879,7 @@@ static void xen_set_fixmap(unsigned idx
  # endif
  #else
        case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+       case VVAR_PAGE:
  #endif
        case FIX_TEXT_POKE0:
        case FIX_TEXT_POKE1:
  #ifdef CONFIG_X86_64
        /* Replicate changes to map the vsyscall page into the user
           pagetable vsyscall mapping. */
-       if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+       if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) ||
+           idx == VVAR_PAGE) {
                unsigned long vaddr = __fix_to_virt(idx);
                set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
        }
@@@ -1897,9 -1957,6 +1953,6 @@@ void __init xen_ident_map_ISA(void
  
  static void __init xen_post_allocator_init(void)
  {
- #ifdef CONFIG_XEN_DEBUG
-       pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
- #endif
        pv_mmu_ops.set_pte = xen_set_pte;
        pv_mmu_ops.set_pmd = xen_set_pmd;
        pv_mmu_ops.set_pud = xen_set_pud;
@@@ -2309,17 -2366,3 +2362,3 @@@ out
        return err;
  }
  EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
- #ifdef CONFIG_XEN_DEBUG_FS
- static int p2m_dump_open(struct inode *inode, struct file *filp)
- {
-       return single_open(filp, p2m_dump_show, NULL);
- }
- static const struct file_operations p2m_dump_fops = {
-       .open           = p2m_dump_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
- };
- #endif /* CONFIG_XEN_DEBUG_FS */
diff --combined arch/x86/xen/setup.c
index 73daaf75801aae1275241569d13d01f0a122a04d,38d0af4fefec19f52d5e724c8f08102d391dc2e6..f5e1362550e76130eb4b770e926825ec76cb56f1
@@@ -9,6 -9,7 +9,7 @@@
  #include <linux/mm.h>
  #include <linux/pm.h>
  #include <linux/memblock.h>
+ #include <linux/cpuidle.h>
  
  #include <asm/elf.h>
  #include <asm/vdso.h>
@@@ -36,7 -37,10 +37,10 @@@ extern void xen_syscall_target(void)
  extern void xen_syscall32_target(void);
  
  /* Amount of extra memory space we add to the e820 ranges */
- phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
+ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
+ /* Number of pages released from the initial allocation. */
+ unsigned long xen_released_pages;
  
  /* 
   * The maximum amount of extra memory compared to the base size.  The
   */
  #define EXTRA_MEM_RATIO               (10)
  
- static void __init xen_add_extra_mem(unsigned long pages)
+ static void __init xen_add_extra_mem(u64 start, u64 size)
  {
        unsigned long pfn;
+       int i;
  
-       u64 size = (u64)pages * PAGE_SIZE;
-       u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
-       if (!pages)
-               return;
-       e820_add_region(extra_start, size, E820_RAM);
-       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-       memblock_reserve(extra_start, size);
+       for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+               /* Add new region. */
+               if (xen_extra_mem[i].size == 0) {
+                       xen_extra_mem[i].start = start;
+                       xen_extra_mem[i].size  = size;
+                       break;
+               }
+               /* Append to existing region. */
+               if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
+                       xen_extra_mem[i].size += size;
+                       break;
+               }
+       }
+       if (i == XEN_EXTRA_MEM_MAX_REGIONS)
+               printk(KERN_WARNING "Warning: not enough extra memory regions\n");
  
-       xen_extra_mem_size += size;
 -      memblock_x86_reserve_range(start, start + size, "XEN EXTRA");
++      memblock_reserve(start, size);
  
-       xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
+       xen_max_p2m_pfn = PFN_DOWN(start + size);
  
-       for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
+       for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
                __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
  }
  
- static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
-                                             phys_addr_t end_addr)
+ static unsigned long __init xen_release_chunk(unsigned long start,
+                                             unsigned long end)
  {
        struct xen_memory_reservation reservation = {
                .address_bits = 0,
                .extent_order = 0,
                .domid        = DOMID_SELF
        };
-       unsigned long start, end;
        unsigned long len = 0;
        unsigned long pfn;
        int ret;
  
-       start = PFN_UP(start_addr);
-       end = PFN_DOWN(end_addr);
-       if (end <= start)
-               return 0;
-       printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ",
-              start, end);
        for(pfn = start; pfn < end; pfn++) {
                unsigned long mfn = pfn_to_mfn(pfn);
  
  
                ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
                                           &reservation);
-               WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
-                    start, end, ret);
+               WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
                if (ret == 1) {
                        __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
                        len++;
                }
        }
-       printk(KERN_CONT "%ld pages freed\n", len);
+       printk(KERN_INFO "Freeing  %lx-%lx pfn range: %lu pages freed\n",
+              start, end, len);
  
        return len;
  }
  
- static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
-                                                    const struct e820map *e820)
+ static unsigned long __init xen_set_identity_and_release(
+       const struct e820entry *list, size_t map_size, unsigned long nr_pages)
  {
-       phys_addr_t max_addr = PFN_PHYS(max_pfn);
-       phys_addr_t last_end = ISA_END_ADDRESS;
+       phys_addr_t start = 0;
        unsigned long released = 0;
+       unsigned long identity = 0;
+       const struct e820entry *entry;
        int i;
  
-       /* Free any unused memory above the low 1Mbyte. */
-       for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
-               phys_addr_t end = e820->map[i].addr;
-               end = min(max_addr, end);
+       /*
+        * Combine non-RAM regions and gaps until a RAM region (or the
+        * end of the map) is reached, then set the 1:1 map and
+        * release the pages (if available) in those non-RAM regions.
+        *
+        * The combined non-RAM regions are rounded to a whole number
+        * of pages so any partial pages are accessible via the 1:1
+        * mapping.  This is needed for some BIOSes that put (for
+        * example) the DMI tables in a reserved region that begins on
+        * a non-page boundary.
+        */
+       for (i = 0, entry = list; i < map_size; i++, entry++) {
+               phys_addr_t end = entry->addr + entry->size;
+               if (entry->type == E820_RAM || i == map_size - 1) {
+                       unsigned long start_pfn = PFN_DOWN(start);
+                       unsigned long end_pfn = PFN_UP(end);
  
-               if (last_end < end)
-                       released += xen_release_chunk(last_end, end);
-               last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
+                       if (entry->type == E820_RAM)
+                               end_pfn = PFN_UP(entry->addr);
+                       if (start_pfn < end_pfn) {
+                               if (start_pfn < nr_pages)
+                                       released += xen_release_chunk(
+                                               start_pfn, min(end_pfn, nr_pages));
+                               identity += set_phys_range_identity(
+                                       start_pfn, end_pfn);
+                       }
+                       start = end;
+               }
        }
  
-       if (last_end < max_addr)
-               released += xen_release_chunk(last_end, max_addr);
+       printk(KERN_INFO "Released %lu pages of unused memory\n", released);
+       printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
  
-       printk(KERN_INFO "released %ld pages of unused memory\n", released);
        return released;
  }
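
The rounding in xen_set_identity_and_release() can be checked with plain arithmetic: a combined non-RAM span is widened to whole pages (PFN_DOWN of its start, PFN_UP of its end) so that, for example, DMI tables starting mid-page stay reachable through the 1:1 mapping. A small user-space sketch with made-up addresses and the usual 4 KiB page size:

/* Illustrative arithmetic only; 4 KiB pages, addresses are made up. */
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)
#define PFN_UP(x)       (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int main(void)
{
        unsigned long start = 0x9fc00;  /* reserved span starts mid-page */
        unsigned long end   = 0xa0000;  /* ends on a page boundary       */

        /* Widened outward: pfns 0x9f-0xa0 get the identity mapping, so the
         * partial page at 0x9f000 remains accessible through the 1:1 map. */
        printf("identity pfns %#lx-%#lx\n", PFN_DOWN(start), PFN_UP(end));
        return 0;
}
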
  
- static unsigned long __init xen_set_identity(const struct e820entry *list,
-                                            ssize_t map_size)
+ static unsigned long __init xen_get_max_pages(void)
  {
-       phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
-       phys_addr_t start_pci = last;
-       const struct e820entry *entry;
-       unsigned long identity = 0;
-       int i;
-       for (i = 0, entry = list; i < map_size; i++, entry++) {
-               phys_addr_t start = entry->addr;
-               phys_addr_t end = start + entry->size;
-               if (start < last)
-                       start = last;
-               if (end <= start)
-                       continue;
+       unsigned long max_pages = MAX_DOMAIN_PAGES;
+       domid_t domid = DOMID_SELF;
+       int ret;
  
-               /* Skip over the 1MB region. */
-               if (last > end)
-                       continue;
+       ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
+       if (ret > 0)
+               max_pages = ret;
+       return min(max_pages, MAX_DOMAIN_PAGES);
+ }
  
-               if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
-                       if (start > start_pci)
-                               identity += set_phys_range_identity(
-                                               PFN_UP(start_pci), PFN_DOWN(start));
+ static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
+ {
+       u64 end = start + size;
  
-                       /* Without saving 'last' we would gooble RAM too
-                        * at the end of the loop. */
-                       last = end;
-                       start_pci = end;
-                       continue;
-               }
-               start_pci = min(start, start_pci);
-               last = end;
+       /* Align RAM regions to page boundaries. */
+       if (type == E820_RAM) {
+               start = PAGE_ALIGN(start);
+               end &= ~((u64)PAGE_SIZE - 1);
        }
-       if (last > start_pci)
-               identity += set_phys_range_identity(
-                                       PFN_UP(start_pci), PFN_DOWN(last));
-       return identity;
+       e820_add_region(start, end - start, type);
  }
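
Conversely, xen_align_and_add_e820_region() shrinks RAM entries inward to page boundaries before adding them to the e820 map. A quick check of that arithmetic, again with made-up addresses:

/* Illustrative arithmetic only; 4 KiB pages, addresses are made up. */
#include <stdio.h>

#define PAGE_SIZE       4096ULL
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long long start = 0x100800;    /* RAM entry starts mid-page */
        unsigned long long end   = 0x200c00;    /* ...and ends mid-page      */

        start = PAGE_ALIGN(start);              /* rounded up:   0x101000 */
        end  &= ~(PAGE_SIZE - 1);               /* rounded down: 0x200000 */
        printf("RAM region [%#llx-%#llx)\n", start, end);
        return 0;
}
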
  /**
   * machine_specific_memory_setup - Hook for machine specific memory setup.
   **/
  char * __init xen_memory_setup(void)
  {
        static struct e820entry map[E820MAX] __initdata;
-       static struct e820entry map_raw[E820MAX] __initdata;
  
        unsigned long max_pfn = xen_start_info->nr_pages;
        unsigned long long mem_end;
        int rc;
        struct xen_memory_map memmap;
+       unsigned long max_pages;
        unsigned long extra_pages = 0;
-       unsigned long extra_limit;
-       unsigned long identity_pages = 0;
        int i;
        int op;
  
        }
        BUG_ON(rc);
  
-       memcpy(map_raw, map, sizeof(map));
-       e820.nr_map = 0;
-       xen_extra_mem_start = mem_end;
-       for (i = 0; i < memmap.nr_entries; i++) {
-               unsigned long long end;
-               /* Guard against non-page aligned E820 entries. */
-               if (map[i].type == E820_RAM)
-                       map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
-               end = map[i].addr + map[i].size;
-               if (map[i].type == E820_RAM && end > mem_end) {
-                       /* RAM off the end - may be partially included */
-                       u64 delta = min(map[i].size, end - mem_end);
-                       map[i].size -= delta;
-                       end -= delta;
-                       extra_pages += PFN_DOWN(delta);
-                       /*
-                        * Set RAM below 4GB that is not for us to be unusable.
-                        * This prevents "System RAM" address space from being
-                        * used as potential resource for I/O address (happens
-                        * when 'allocate_resource' is called).
-                        */
-                       if (delta &&
-                               (xen_initial_domain() && end < 0x100000000ULL))
-                               e820_add_region(end, delta, E820_UNUSABLE);
+       /* Make sure the Xen-supplied memory map is well-ordered. */
+       sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
+       max_pages = xen_get_max_pages();
+       if (max_pages > max_pfn)
+               extra_pages += max_pages - max_pfn;
+       /*
+        * Set P2M for all non-RAM pages and E820 gaps to be identity
+        * type PFNs.  Any RAM pages that would be made inaccessible by
+        * this are first released.
+        */
+       xen_released_pages = xen_set_identity_and_release(
+               map, memmap.nr_entries, max_pfn);
+       extra_pages += xen_released_pages;
+       /*
+        * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
+        * factor of the base size.  On non-highmem systems, the base
+        * size is the full initial memory allocation; on highmem it
+        * is limited to the max size of lowmem, so that it doesn't
+        * get completely filled.
+        *
+        * In principle there could be a problem in lowmem systems if
+        * the initial memory is also very large with respect to
+        * lowmem, but we won't try to deal with that here.
+        */
+       extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+                         extra_pages);
+       i = 0;
+       while (i < memmap.nr_entries) {
+               u64 addr = map[i].addr;
+               u64 size = map[i].size;
+               u32 type = map[i].type;
+               if (type == E820_RAM) {
+                       if (addr < mem_end) {
+                               size = min(size, mem_end - addr);
+                       } else if (extra_pages) {
+                               size = min(size, (u64)extra_pages * PAGE_SIZE);
+                               extra_pages -= size / PAGE_SIZE;
+                               xen_add_extra_mem(addr, size);
+                       } else
+                               type = E820_UNUSABLE;
                }
  
-               if (map[i].size > 0 && end > xen_extra_mem_start)
-                       xen_extra_mem_start = end;
+               xen_align_and_add_e820_region(addr, size, type);
  
-               /* Add region if any remains */
-               if (map[i].size > 0)
-                       e820_add_region(map[i].addr, map[i].size, map[i].type);
+               map[i].addr += size;
+               map[i].size -= size;
+               if (map[i].size == 0)
+                       i++;
        }
-       /* Align the balloon area so that max_low_pfn does not get set
-        * to be at the _end_ of the PCI gap at the far end (fee01000).
-        * Note that xen_extra_mem_start gets set in the loop above to be
-        * past the last E820 region. */
-       if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32)))
-               xen_extra_mem_start = (1ULL<<32);
  
        /*
         * In domU, the ISA region is normal, usable memory, but we
         * reserve ISA memory anyway because too many things poke
         * about in there.
-        *
-        * In Dom0, the host E820 information can leave gaps in the
-        * ISA range, which would cause us to release those pages.  To
-        * avoid this, we unconditionally reserve them here.
         */
        e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
                        E820_RESERVED);
         *  - xen_start_info
         * See comment above "struct start_info" in <xen/interface/xen.h>
         */
 -      memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
 -                    __pa(xen_start_info->pt_base),
 -                      "XEN START INFO");
 +      memblock_reserve(__pa(xen_start_info->mfn_list),
 +                       xen_start_info->pt_base - xen_start_info->mfn_list);
  
        sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
  
-       extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
-       /*
-        * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
-        * factor the base size.  On non-highmem systems, the base
-        * size is the full initial memory allocation; on highmem it
-        * is limited to the max size of lowmem, so that it doesn't
-        * get completely filled.
-        *
-        * In principle there could be a problem in lowmem systems if
-        * the initial memory is also very large with respect to
-        * lowmem, but we won't try to deal with that here.
-        */
-       extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
-                         max_pfn + extra_pages);
-       if (extra_limit >= max_pfn)
-               extra_pages = extra_limit - max_pfn;
-       else
-               extra_pages = 0;
-       xen_add_extra_mem(extra_pages);
-       /*
-        * Set P2M for all non-RAM pages and E820 gaps to be identity
-        * type PFNs. We supply it with the non-sanitized version
-        * of the E820.
-        */
-       identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
-       printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
        return "Xen";
  }
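
The extra_pages clamp in xen_memory_setup() is just min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages), with EXTRA_MEM_RATIO defined as 10 earlier in this file. A worked example with made-up pfn counts (the MAXMEM-derived limit is configuration dependent):

/* Illustrative arithmetic only; the pfn counts are made up. */
#include <stdio.h>

#define EXTRA_MEM_RATIO 10

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long max_pfn     = 131072;     /* 512 MiB initial allocation     */
        unsigned long maxmem_pfn  = 229376;     /* example lowmem limit, 896 MiB  */
        unsigned long extra_pages = 2000000;    /* hypervisor would allow this    */

        extra_pages = min_ul(EXTRA_MEM_RATIO * min_ul(max_pfn, maxmem_pfn),
                             extra_pages);
        /* 10 * 131072 = 1310720 < 2000000, so the balloon area is capped at
         * about 5 GiB worth of pages rather than everything on offer. */
        printf("extra_pages = %lu\n", extra_pages);
        return 0;
}
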
  
@@@ -425,7 -408,7 +407,7 @@@ void __init xen_arch_setup(void
  #ifdef CONFIG_X86_32
        boot_cpu_data.hlt_works_ok = 1;
  #endif
-       pm_idle = default_idle;
+       disable_cpuidle();
        boot_option_idle_override = IDLE_HALT;
  
        fiddle_vdso();
diff --combined drivers/iommu/intel-iommu.c
index 0000000000000000000000000000000000000000,c0c7820d4c46b406465e0d2d8e059a80ce819476..bcbd693b351ae8f250496388ff5f0921a90b0be6
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,4179 +1,4173 @@@
 -static int __init si_domain_work_fn(unsigned long start_pfn,
 -                                  unsigned long end_pfn, void *datax)
 -{
 -      int *ret = datax;
 -
 -      *ret = iommu_domain_identity_map(si_domain,
 -                                       (uint64_t)start_pfn << PAGE_SHIFT,
 -                                       (uint64_t)end_pfn << PAGE_SHIFT);
 -      return *ret;
 -
 -}
 -
+ /*
+  * Copyright (c) 2006, Intel Corporation.
+  *
+  * This program is free software; you can redistribute it and/or modify it
+  * under the terms and conditions of the GNU General Public License,
+  * version 2, as published by the Free Software Foundation.
+  *
+  * This program is distributed in the hope it will be useful, but WITHOUT
+  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+  * more details.
+  *
+  * You should have received a copy of the GNU General Public License along with
+  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+  * Place - Suite 330, Boston, MA 02111-1307 USA.
+  *
+  * Copyright (C) 2006-2008 Intel Corporation
+  * Author: Ashok Raj <ashok.raj@intel.com>
+  * Author: Shaohua Li <shaohua.li@intel.com>
+  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+  * Author: Fenghua Yu <fenghua.yu@intel.com>
+  */
+ #include <linux/init.h>
+ #include <linux/bitmap.h>
+ #include <linux/debugfs.h>
+ #include <linux/export.h>
+ #include <linux/slab.h>
+ #include <linux/irq.h>
+ #include <linux/interrupt.h>
+ #include <linux/spinlock.h>
+ #include <linux/pci.h>
+ #include <linux/dmar.h>
+ #include <linux/dma-mapping.h>
+ #include <linux/mempool.h>
+ #include <linux/timer.h>
+ #include <linux/iova.h>
+ #include <linux/iommu.h>
+ #include <linux/intel-iommu.h>
+ #include <linux/syscore_ops.h>
+ #include <linux/tboot.h>
+ #include <linux/dmi.h>
+ #include <linux/pci-ats.h>
+ #include <asm/cacheflush.h>
+ #include <asm/iommu.h>
+ #define ROOT_SIZE             VTD_PAGE_SIZE
+ #define CONTEXT_SIZE          VTD_PAGE_SIZE
+ #define IS_BRIDGE_HOST_DEVICE(pdev) \
+                           ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
+ #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
+ #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
+ #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
+ #define IOAPIC_RANGE_START    (0xfee00000)
+ #define IOAPIC_RANGE_END      (0xfeefffff)
+ #define IOVA_START_ADDR               (0x1000)
+ #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
+ #define MAX_AGAW_WIDTH 64
+ #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
+ #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
+ /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
+    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
+ #define DOMAIN_MAX_PFN(gaw)   ((unsigned long) min_t(uint64_t, \
+                               __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
+ #define DOMAIN_MAX_ADDR(gaw)  (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
+ #define IOVA_PFN(addr)                ((addr) >> PAGE_SHIFT)
+ #define DMA_32BIT_PFN         IOVA_PFN(DMA_BIT_MASK(32))
+ #define DMA_64BIT_PFN         IOVA_PFN(DMA_BIT_MASK(64))
+ /* page table handling */
+ #define LEVEL_STRIDE          (9)
+ #define LEVEL_MASK            (((u64)1 << LEVEL_STRIDE) - 1)
+ static inline int agaw_to_level(int agaw)
+ {
+       return agaw + 2;
+ }
+ static inline int agaw_to_width(int agaw)
+ {
+       return 30 + agaw * LEVEL_STRIDE;
+ }
+ static inline int width_to_agaw(int width)
+ {
+       return (width - 30) / LEVEL_STRIDE;
+ }
+ static inline unsigned int level_to_offset_bits(int level)
+ {
+       return (level - 1) * LEVEL_STRIDE;
+ }
+ static inline int pfn_level_offset(unsigned long pfn, int level)
+ {
+       return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
+ }
+ static inline unsigned long level_mask(int level)
+ {
+       return -1UL << level_to_offset_bits(level);
+ }
+ static inline unsigned long level_size(int level)
+ {
+       return 1UL << level_to_offset_bits(level);
+ }
+ static inline unsigned long align_to_level(unsigned long pfn, int level)
+ {
+       return (pfn + level_size(level) - 1) & level_mask(level);
+ }
+ static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
+ {
+       return  1 << ((lvl - 1) * LEVEL_STRIDE);
+ }
+ /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
+    are never going to work. */
+ static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
+ {
+       return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
+ }
+ static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
+ {
+       return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
+ }
+ static inline unsigned long page_to_dma_pfn(struct page *pg)
+ {
+       return mm_to_dma_pfn(page_to_pfn(pg));
+ }
+ static inline unsigned long virt_to_dma_pfn(void *p)
+ {
+       return page_to_dma_pfn(virt_to_page(p));
+ }
+ /* global iommu list, set NULL for ignored DMAR units */
+ static struct intel_iommu **g_iommus;
+ static void __init check_tylersburg_isoch(void);
+ static int rwbf_quirk;
+ /*
+  * set to 1 to panic kernel if can't successfully enable VT-d
+  * (used when kernel is launched w/ TXT)
+  */
+ static int force_on = 0;
+ /*
+  * 0: Present
+  * 1-11: Reserved
+  * 12-63: Context Ptr (12 - (haw-1))
+  * 64-127: Reserved
+  */
+ struct root_entry {
+       u64     val;
+       u64     rsvd1;
+ };
+ #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
+ static inline bool root_present(struct root_entry *root)
+ {
+       return (root->val & 1);
+ }
+ static inline void set_root_present(struct root_entry *root)
+ {
+       root->val |= 1;
+ }
+ static inline void set_root_value(struct root_entry *root, unsigned long value)
+ {
+       root->val |= value & VTD_PAGE_MASK;
+ }
+ static inline struct context_entry *
+ get_context_addr_from_root(struct root_entry *root)
+ {
+       return (struct context_entry *)
+               (root_present(root)?phys_to_virt(
+               root->val & VTD_PAGE_MASK) :
+               NULL);
+ }
+ /*
+  * low 64 bits:
+  * 0: present
+  * 1: fault processing disable
+  * 2-3: translation type
+  * 12-63: address space root
+  * high 64 bits:
+  * 0-2: address width
+  * 3-6: aval
+  * 8-23: domain id
+  */
+ struct context_entry {
+       u64 lo;
+       u64 hi;
+ };
+ static inline bool context_present(struct context_entry *context)
+ {
+       return (context->lo & 1);
+ }
+ static inline void context_set_present(struct context_entry *context)
+ {
+       context->lo |= 1;
+ }
+ static inline void context_set_fault_enable(struct context_entry *context)
+ {
+       context->lo &= (((u64)-1) << 2) | 1;
+ }
+ static inline void context_set_translation_type(struct context_entry *context,
+                                               unsigned long value)
+ {
+       context->lo &= (((u64)-1) << 4) | 3;
+       context->lo |= (value & 3) << 2;
+ }
+ static inline void context_set_address_root(struct context_entry *context,
+                                           unsigned long value)
+ {
+       context->lo |= value & VTD_PAGE_MASK;
+ }
+ static inline void context_set_address_width(struct context_entry *context,
+                                            unsigned long value)
+ {
+       context->hi |= value & 7;
+ }
+ static inline void context_set_domain_id(struct context_entry *context,
+                                        unsigned long value)
+ {
+       context->hi |= (value & ((1 << 16) - 1)) << 8;
+ }
+ static inline void context_clear_entry(struct context_entry *context)
+ {
+       context->lo = 0;
+       context->hi = 0;
+ }
+ /*
+  * 0: readable
+  * 1: writable
+  * 2-6: reserved
+  * 7: super page
+  * 8-10: available
+  * 11: snoop behavior
+  * 12-63: Host physical address
+  */
+ struct dma_pte {
+       u64 val;
+ };
+ static inline void dma_clear_pte(struct dma_pte *pte)
+ {
+       pte->val = 0;
+ }
+ static inline void dma_set_pte_readable(struct dma_pte *pte)
+ {
+       pte->val |= DMA_PTE_READ;
+ }
+ static inline void dma_set_pte_writable(struct dma_pte *pte)
+ {
+       pte->val |= DMA_PTE_WRITE;
+ }
+ static inline void dma_set_pte_snp(struct dma_pte *pte)
+ {
+       pte->val |= DMA_PTE_SNP;
+ }
+ static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
+ {
+       pte->val = (pte->val & ~3) | (prot & 3);
+ }
+ static inline u64 dma_pte_addr(struct dma_pte *pte)
+ {
+ #ifdef CONFIG_64BIT
+       return pte->val & VTD_PAGE_MASK;
+ #else
+       /* Must have a full atomic 64-bit read */
+       return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
+ #endif
+ }
+ static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
+ {
+       pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
+ }
+ static inline bool dma_pte_present(struct dma_pte *pte)
+ {
+       return (pte->val & 3) != 0;
+ }
+ static inline bool dma_pte_superpage(struct dma_pte *pte)
+ {
+       return (pte->val & (1 << 7));
+ }
+ static inline int first_pte_in_page(struct dma_pte *pte)
+ {
+       return !((unsigned long)pte & ~VTD_PAGE_MASK);
+ }
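
The dma_pte helpers above pack the host pfn and the read/write bits into one 64-bit entry (bit 0 readable, bit 1 writable, address from bit 12 up, per the layout comment). A stand-alone round trip of that packing, assuming 4 KiB VT-d pages:

/* Stand-alone model of the dma_pte packing shown above (4 KiB VT-d pages). */
#include <stdio.h>
#include <stdint.h>

#define VTD_PAGE_SHIFT  12
#define VTD_PAGE_MASK   (~((1ULL << VTD_PAGE_SHIFT) - 1))
#define DMA_PTE_READ    (1ULL << 0)     /* bit 0: readable */
#define DMA_PTE_WRITE   (1ULL << 1)     /* bit 1: writable */

int main(void)
{
        uint64_t pte = 0;
        uint64_t pfn = 0x12345;

        pte |= DMA_PTE_READ | DMA_PTE_WRITE;    /* dma_set_pte_readable/writable */
        pte |= pfn << VTD_PAGE_SHIFT;           /* dma_set_pte_pfn               */

        printf("present=%d addr=%#llx\n",
               (pte & 3) != 0,                          /* dma_pte_present */
               (unsigned long long)(pte & VTD_PAGE_MASK)); /* dma_pte_addr */
        return 0;
}
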
+ /*
+  * This domain is a static identity mapping domain.
+  *    1. This domain creates a static 1:1 mapping to all usable memory.
+  *    2. It maps to each iommu if successful.
+  *    3. Each iommu maps to this domain if successful.
+  */
+ static struct dmar_domain *si_domain;
+ static int hw_pass_through = 1;
+ /* devices under the same p2p bridge are owned in one domain */
+ #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
+ /* domain represents a virtual machine, more than one devices
+  * across iommus may be owned in one domain, e.g. kvm guest.
+  */
+ #define DOMAIN_FLAG_VIRTUAL_MACHINE   (1 << 1)
+ /* si_domain contains multiple devices */
+ #define DOMAIN_FLAG_STATIC_IDENTITY   (1 << 2)
+ struct dmar_domain {
+       int     id;                     /* domain id */
+       int     nid;                    /* node id */
+       unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
+       struct list_head devices;       /* all devices' list */
+       struct iova_domain iovad;       /* iova's that belong to this domain */
+       struct dma_pte  *pgd;           /* virtual address */
+       int             gaw;            /* max guest address width */
+       /* adjusted guest address width, 0 is level 2 30-bit */
+       int             agaw;
+       int             flags;          /* flags to find out type of domain */
+       int             iommu_coherency;/* indicate coherency of iommu access */
+       int             iommu_snooping; /* indicate snooping control feature*/
+       int             iommu_count;    /* reference count of iommu */
+       int             iommu_superpage;/* Level of superpages supported:
+                                          0 == 4KiB (no superpages), 1 == 2MiB,
+                                          2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
+       spinlock_t      iommu_lock;     /* protect iommu set in domain */
+       u64             max_addr;       /* maximum mapped address */
+ };
+ /* PCI domain-device relationship */
+ struct device_domain_info {
+       struct list_head link;  /* link to domain siblings */
+       struct list_head global; /* link to global list */
+       int segment;            /* PCI domain */
+       u8 bus;                 /* PCI bus number */
+       u8 devfn;               /* PCI devfn number */
+       struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
+       struct intel_iommu *iommu; /* IOMMU used by this device */
+       struct dmar_domain *domain; /* pointer to domain */
+ };
+ static void flush_unmaps_timeout(unsigned long data);
+ DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
+ #define HIGH_WATER_MARK 250
+ struct deferred_flush_tables {
+       int next;
+       struct iova *iova[HIGH_WATER_MARK];
+       struct dmar_domain *domain[HIGH_WATER_MARK];
+ };
+ static struct deferred_flush_tables *deferred_flush;
+ /* bitmap for indexing intel_iommus */
+ static int g_num_of_iommus;
+ static DEFINE_SPINLOCK(async_umap_flush_lock);
+ static LIST_HEAD(unmaps_to_do);
+ static int timer_on;
+ static long list_size;
+ static void domain_remove_dev_info(struct dmar_domain *domain);
+ #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
+ int dmar_disabled = 0;
+ #else
+ int dmar_disabled = 1;
+ #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
+ static int dmar_map_gfx = 1;
+ static int dmar_forcedac;
+ static int intel_iommu_strict;
+ static int intel_iommu_superpage = 1;
+ int intel_iommu_gfx_mapped;
+ EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
+ #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
+ static DEFINE_SPINLOCK(device_domain_lock);
+ static LIST_HEAD(device_domain_list);
+ static struct iommu_ops intel_iommu_ops;
+ static int __init intel_iommu_setup(char *str)
+ {
+       if (!str)
+               return -EINVAL;
+       while (*str) {
+               if (!strncmp(str, "on", 2)) {
+                       dmar_disabled = 0;
+                       printk(KERN_INFO "Intel-IOMMU: enabled\n");
+               } else if (!strncmp(str, "off", 3)) {
+                       dmar_disabled = 1;
+                       printk(KERN_INFO "Intel-IOMMU: disabled\n");
+               } else if (!strncmp(str, "igfx_off", 8)) {
+                       dmar_map_gfx = 0;
+                       printk(KERN_INFO
+                               "Intel-IOMMU: disable GFX device mapping\n");
+               } else if (!strncmp(str, "forcedac", 8)) {
+                       printk(KERN_INFO
+                               "Intel-IOMMU: Forcing DAC for PCI devices\n");
+                       dmar_forcedac = 1;
+               } else if (!strncmp(str, "strict", 6)) {
+                       printk(KERN_INFO
+                               "Intel-IOMMU: disable batched IOTLB flush\n");
+                       intel_iommu_strict = 1;
+               } else if (!strncmp(str, "sp_off", 6)) {
+                       printk(KERN_INFO
+                               "Intel-IOMMU: disable supported super page\n");
+                       intel_iommu_superpage = 0;
+               }
+               str += strcspn(str, ",");
+               while (*str == ',')
+                       str++;
+       }
+       return 0;
+ }
+ __setup("intel_iommu=", intel_iommu_setup);
+ static struct kmem_cache *iommu_domain_cache;
+ static struct kmem_cache *iommu_devinfo_cache;
+ static struct kmem_cache *iommu_iova_cache;
+ static inline void *alloc_pgtable_page(int node)
+ {
+       struct page *page;
+       void *vaddr = NULL;
+       page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
+       if (page)
+               vaddr = page_address(page);
+       return vaddr;
+ }
+ static inline void free_pgtable_page(void *vaddr)
+ {
+       free_page((unsigned long)vaddr);
+ }
+ static inline void *alloc_domain_mem(void)
+ {
+       return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
+ }
+ static void free_domain_mem(void *vaddr)
+ {
+       kmem_cache_free(iommu_domain_cache, vaddr);
+ }
+ static inline void * alloc_devinfo_mem(void)
+ {
+       return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
+ }
+ static inline void free_devinfo_mem(void *vaddr)
+ {
+       kmem_cache_free(iommu_devinfo_cache, vaddr);
+ }
+ struct iova *alloc_iova_mem(void)
+ {
+       return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
+ }
+ void free_iova_mem(struct iova *iova)
+ {
+       kmem_cache_free(iommu_iova_cache, iova);
+ }
+ static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
+ {
+       unsigned long sagaw;
+       int agaw = -1;
+       sagaw = cap_sagaw(iommu->cap);
+       for (agaw = width_to_agaw(max_gaw);
+            agaw >= 0; agaw--) {
+               if (test_bit(agaw, &sagaw))
+                       break;
+       }
+       return agaw;
+ }
+ /*
+  * Calculate max SAGAW for each iommu.
+  */
+ int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
+ {
+       return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
+ }
+ /*
+  * calculate agaw for each iommu.
+  * "SAGAW" may be different across iommus, use a default agaw, and
+  * fall back to a smaller supported agaw for iommus that don't support the default.
+  */
+ int iommu_calculate_agaw(struct intel_iommu *iommu)
+ {
+       return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+ }
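
The agaw/width/level conversions are linear in LEVEL_STRIDE (9 bits per level), so the default 48-bit address width maps to agaw 2 and a 4-level page table. A quick stand-alone check of that arithmetic:

/* Quick check of the agaw arithmetic above (LEVEL_STRIDE = 9). */
#include <stdio.h>

#define LEVEL_STRIDE    9

static int width_to_agaw(int width) { return (width - 30) / LEVEL_STRIDE; }
static int agaw_to_level(int agaw)  { return agaw + 2; }
static int agaw_to_width(int agaw)  { return 30 + agaw * LEVEL_STRIDE; }

int main(void)
{
        int agaw = width_to_agaw(48);   /* DEFAULT_DOMAIN_ADDRESS_WIDTH */

        /* 48-bit guest address width -> agaw 2 -> 4 page-table levels */
        printf("agaw=%d levels=%d width=%d\n",
               agaw, agaw_to_level(agaw), agaw_to_width(agaw));
        return 0;
}
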
+ /* This function only returns a single iommu in a domain */
+ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
+ {
+       int iommu_id;
+       /* si_domain and vm domain should not get here. */
+       BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
+       BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
+       iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
+       if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
+               return NULL;
+       return g_iommus[iommu_id];
+ }
+ static void domain_update_iommu_coherency(struct dmar_domain *domain)
+ {
+       int i;
+       domain->iommu_coherency = 1;
+       for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
+               if (!ecap_coherent(g_iommus[i]->ecap)) {
+                       domain->iommu_coherency = 0;
+                       break;
+               }
+       }
+ }
+ static void domain_update_iommu_snooping(struct dmar_domain *domain)
+ {
+       int i;
+       domain->iommu_snooping = 1;
+       for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
+               if (!ecap_sc_support(g_iommus[i]->ecap)) {
+                       domain->iommu_snooping = 0;
+                       break;
+               }
+       }
+ }
+ static void domain_update_iommu_superpage(struct dmar_domain *domain)
+ {
+       struct dmar_drhd_unit *drhd;
+       struct intel_iommu *iommu = NULL;
+       int mask = 0xf;
+       if (!intel_iommu_superpage) {
+               domain->iommu_superpage = 0;
+               return;
+       }
+       /* set iommu_superpage to the smallest common denominator */
+       for_each_active_iommu(iommu, drhd) {
+               mask &= cap_super_page_val(iommu->cap);
+               if (!mask) {
+                       break;
+               }
+       }
+       domain->iommu_superpage = fls(mask);
+ }
+ /* Some capabilities may be different across iommus */
+ static void domain_update_iommu_cap(struct dmar_domain *domain)
+ {
+       domain_update_iommu_coherency(domain);
+       domain_update_iommu_snooping(domain);
+       domain_update_iommu_superpage(domain);
+ }
+ static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
+ {
+       struct dmar_drhd_unit *drhd = NULL;
+       int i;
+       for_each_drhd_unit(drhd) {
+               if (drhd->ignored)
+                       continue;
+               if (segment != drhd->segment)
+                       continue;
+               for (i = 0; i < drhd->devices_cnt; i++) {
+                       if (drhd->devices[i] &&
+                           drhd->devices[i]->bus->number == bus &&
+                           drhd->devices[i]->devfn == devfn)
+                               return drhd->iommu;
+                       if (drhd->devices[i] &&
+                           drhd->devices[i]->subordinate &&
+                           drhd->devices[i]->subordinate->number <= bus &&
+                           drhd->devices[i]->subordinate->subordinate >= bus)
+                               return drhd->iommu;
+               }
+               if (drhd->include_all)
+                       return drhd->iommu;
+       }
+       return NULL;
+ }
+ static void domain_flush_cache(struct dmar_domain *domain,
+                              void *addr, int size)
+ {
+       if (!domain->iommu_coherency)
+               clflush_cache_range(addr, size);
+ }
+ /* Gets context entry for a given bus and devfn */
+ static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
+               u8 bus, u8 devfn)
+ {
+       struct root_entry *root;
+       struct context_entry *context;
+       unsigned long phy_addr;
+       unsigned long flags;
+       spin_lock_irqsave(&iommu->lock, flags);
+       root = &iommu->root_entry[bus];
+       context = get_context_addr_from_root(root);
+       if (!context) {
+               context = (struct context_entry *)
+                               alloc_pgtable_page(iommu->node);
+               if (!context) {
+                       spin_unlock_irqrestore(&iommu->lock, flags);
+                       return NULL;
+               }
+               __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
+               phy_addr = virt_to_phys((void *)context);
+               set_root_value(root, phy_addr);
+               set_root_present(root);
+               __iommu_flush_cache(iommu, root, sizeof(*root));
+       }
+       spin_unlock_irqrestore(&iommu->lock, flags);
+       return &context[devfn];
+ }
+ static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
+ {
+       struct root_entry *root;
+       struct context_entry *context;
+       int ret;
+       unsigned long flags;
+       spin_lock_irqsave(&iommu->lock, flags);
+       root = &iommu->root_entry[bus];
+       context = get_context_addr_from_root(root);
+       if (!context) {
+               ret = 0;
+               goto out;
+       }
+       ret = context_present(&context[devfn]);
+ out:
+       spin_unlock_irqrestore(&iommu->lock, flags);
+       return ret;
+ }
+ static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
+ {
+       struct root_entry *root;
+       struct context_entry *context;
+       unsigned long flags;
+       spin_lock_irqsave(&iommu->lock, flags);
+       root = &iommu->root_entry[bus];
+       context = get_context_addr_from_root(root);
+       if (context) {
+               context_clear_entry(&context[devfn]);
+               __iommu_flush_cache(iommu, &context[devfn], \
+                       sizeof(*context));
+       }
+       spin_unlock_irqrestore(&iommu->lock, flags);
+ }
+ static void free_context_table(struct intel_iommu *iommu)
+ {
+       struct root_entry *root;
+       int i;
+       unsigned long flags;
+       struct context_entry *context;
+       spin_lock_irqsave(&iommu->lock, flags);
+       if (!iommu->root_entry) {
+               goto out;
+       }
+       for (i = 0; i < ROOT_ENTRY_NR; i++) {
+               root = &iommu->root_entry[i];
+               context = get_context_addr_from_root(root);
+               if (context)
+                       free_pgtable_page(context);
+       }
+       free_pgtable_page(iommu->root_entry);
+       iommu->root_entry = NULL;
+ out:
+       spin_unlock_irqrestore(&iommu->lock, flags);
+ }
+ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
+                                     unsigned long pfn, int target_level)
+ {
+       int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+       struct dma_pte *parent, *pte = NULL;
+       int level = agaw_to_level(domain->agaw);
+       int offset;
+       BUG_ON(!domain->pgd);
+       BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
+       parent = domain->pgd;
+       while (level > 0) {
+               void *tmp_page;
+               offset = pfn_level_offset(pfn, level);
+               pte = &parent[offset];
+               if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
+                       break;
+               if (level == target_level)
+                       break;
+               if (!dma_pte_present(pte)) {
+                       uint64_t pteval;
+                       tmp_page = alloc_pgtable_page(domain->nid);
+                       if (!tmp_page)
+                               return NULL;
+                       domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
+                       pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
+                       if (cmpxchg64(&pte->val, 0ULL, pteval)) {
+                               /* Someone else set it while we were thinking; use theirs. */
+                               free_pgtable_page(tmp_page);
+                       } else {
+                               dma_pte_addr(pte);
+                               domain_flush_cache(domain, pte, sizeof(*pte));
+                       }
+               }
+               parent = phys_to_virt(dma_pte_addr(pte));
+               level--;
+       }
+       return pte;
+ }
+ /* return address's pte at specific level */
+ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
+                                        unsigned long pfn,
+                                        int level, int *large_page)
+ {
+       struct dma_pte *parent, *pte = NULL;
+       int total = agaw_to_level(domain->agaw);
+       int offset;
+       parent = domain->pgd;
+       while (level <= total) {
+               offset = pfn_level_offset(pfn, total);
+               pte = &parent[offset];
+               if (level == total)
+                       return pte;
+               if (!dma_pte_present(pte)) {
+                       *large_page = total;
+                       break;
+               }
+               if (pte->val & DMA_PTE_LARGE_PAGE) {
+                       *large_page = total;
+                       return pte;
+               }
+               parent = phys_to_virt(dma_pte_addr(pte));
+               total--;
+       }
+       return NULL;
+ }
+ /* clear last level pte, a tlb flush should be followed */
+ static int dma_pte_clear_range(struct dmar_domain *domain,
+                               unsigned long start_pfn,
+                               unsigned long last_pfn)
+ {
+       int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+       unsigned int large_page = 1;
+       struct dma_pte *first_pte, *pte;
+       int order;
+       BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
+       BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
+       BUG_ON(start_pfn > last_pfn);
+       /* we don't need lock here; nobody else touches the iova range */
+       do {
+               large_page = 1;
+               first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
+               if (!pte) {
+                       start_pfn = align_to_level(start_pfn + 1, large_page + 1);
+                       continue;
+               }
+               do {
+                       dma_clear_pte(pte);
+                       start_pfn += lvl_to_nr_pages(large_page);
+                       pte++;
+               } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
+               domain_flush_cache(domain, first_pte,
+                                  (void *)pte - (void *)first_pte);
+       } while (start_pfn && start_pfn <= last_pfn);
+       order = (large_page - 1) * 9;
+       return order;
+ }
+ /* free page table pages. last level pte should already be cleared */
+ static void dma_pte_free_pagetable(struct dmar_domain *domain,
+                                  unsigned long start_pfn,
+                                  unsigned long last_pfn)
+ {
+       int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+       struct dma_pte *first_pte, *pte;
+       int total = agaw_to_level(domain->agaw);
+       int level;
+       unsigned long tmp;
+       int large_page = 2;
+       BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
+       BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
+       BUG_ON(start_pfn > last_pfn);
+       /* We don't need lock here; nobody else touches the iova range */
+       level = 2;
+       while (level <= total) {
+               tmp = align_to_level(start_pfn, level);
+               /* If we can't even clear one PTE at this level, we're done */
+               if (tmp + level_size(level) - 1 > last_pfn)
+                       return;
+               do {
+                       large_page = level;
+                       first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
+                       if (large_page > level)
+                               level = large_page + 1;
+                       if (!pte) {
+                               tmp = align_to_level(tmp + 1, level + 1);
+                               continue;
+                       }
+                       do {
+                               if (dma_pte_present(pte)) {
+                                       free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
+                                       dma_clear_pte(pte);
+                               }
+                               pte++;
+                               tmp += level_size(level);
+                       } while (!first_pte_in_page(pte) &&
+                                tmp + level_size(level) - 1 <= last_pfn);
+                       domain_flush_cache(domain, first_pte,
+                                          (void *)pte - (void *)first_pte);
+                       
+               } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
+               level++;
+       }
+       /* free pgd */
+       if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
+               free_pgtable_page(domain->pgd);
+               domain->pgd = NULL;
+       }
+ }
+ /* iommu handling */
+ static int iommu_alloc_root_entry(struct intel_iommu *iommu)
+ {
+       struct root_entry *root;
+       unsigned long flags;
+       root = (struct root_entry *)alloc_pgtable_page(iommu->node);
+       if (!root)
+               return -ENOMEM;
+       __iommu_flush_cache(iommu, root, ROOT_SIZE);
+       spin_lock_irqsave(&iommu->lock, flags);
+       iommu->root_entry = root;
+       spin_unlock_irqrestore(&iommu->lock, flags);
+       return 0;
+ }
+ static void iommu_set_root_entry(struct intel_iommu *iommu)
+ {
+       void *addr;
+       u32 sts;
+       unsigned long flag;
+       addr = iommu->root_entry;
+       raw_spin_lock_irqsave(&iommu->register_lock, flag);
+       dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
+       writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
+       /* Make sure hardware complete it */
+       IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+                     readl, (sts & DMA_GSTS_RTPS), sts);
+       raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+ }
+ static void iommu_flush_write_buffer(struct intel_iommu *iommu)
+ {
+       u32 val;
+       unsigned long flag;
+       if (!rwbf_quirk && !cap_rwbf(iommu->cap))
+               return;
+       raw_spin_lock_irqsave(&iommu->register_lock, flag);
+       writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
+       /* Make sure hardware complete it */
+       IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+                     readl, (!(val & DMA_GSTS_WBFS)), val);
+       raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+ }
+ /* return value determines if we need a write buffer flush */
+ static void __iommu_flush_context(struct intel_iommu *iommu,
+                                 u16 did, u16 source_id, u8 function_mask,
+                                 u64 type)
+ {
+       u64 val = 0;
+       unsigned long flag;
+       switch (type) {
+       case DMA_CCMD_GLOBAL_INVL:
+               val = DMA_CCMD_GLOBAL_INVL;
+               break;
+       case DMA_CCMD_DOMAIN_INVL:
+               val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
+               break;
+       case DMA_CCMD_DEVICE_INVL:
+               val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
+                       | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
+               break;
+       default:
+               BUG();
+       }
+       val |= DMA_CCMD_ICC;
+       raw_spin_lock_irqsave(&iommu->register_lock, flag);
+       dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
+       /* Make sure hardware complete it */
+       IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
+               dmar_readq, (!(val & DMA_CCMD_ICC)), val);
+       raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+ }
+ /* return value determines if we need a write buffer flush */
+ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
+                               u64 addr, unsigned int size_order, u64 type)
+ {
+       int tlb_offset = ecap_iotlb_offset(iommu->ecap);
+       u64 val = 0, val_iva = 0;
+       unsigned long flag;
+       switch (type) {
+       case DMA_TLB_GLOBAL_FLUSH:
+               /* global flush doesn't need set IVA_REG */
+               val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
+               break;
+       case DMA_TLB_DSI_FLUSH:
+               val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+               break;
+       case DMA_TLB_PSI_FLUSH:
+               val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+               /* Note: always flush non-leaf currently */
+               val_iva = size_order | addr;
+               break;
+       default:
+               BUG();
+       }
+       /* Note: set drain read/write */
+ #if 0
+       /*
+        * This is probably to be super secure.. Looks like we can
+        * ignore it without any impact.
+        */
+       if (cap_read_drain(iommu->cap))
+               val |= DMA_TLB_READ_DRAIN;
+ #endif
+       if (cap_write_drain(iommu->cap))
+               val |= DMA_TLB_WRITE_DRAIN;
+       raw_spin_lock_irqsave(&iommu->register_lock, flag);
+       /* Note: Only uses first TLB reg currently */
+       if (val_iva)
+               dmar_writeq(iommu->reg + tlb_offset, val_iva);
+       dmar_writeq(iommu->reg + tlb_offset + 8, val);
+       /* Make sure hardware complete it */
+       IOMMU_WAIT_OP(iommu, tlb_offset + 8,
+               dmar_readq, (!(val & DMA_TLB_IVT)), val);
+       raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+       /* check IOTLB invalidation granularity */
+       if (DMA_TLB_IAIG(val) == 0)
+               printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
+       if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
+               pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
+                       (unsigned long long)DMA_TLB_IIRG(type),
+                       (unsigned long long)DMA_TLB_IAIG(val));
+ }
+ static struct device_domain_info *iommu_support_dev_iotlb(
+       struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
+ {
+       int found = 0;
+       unsigned long flags;
+       struct device_domain_info *info;
+       struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
+       if (!ecap_dev_iotlb_support(iommu->ecap))
+               return NULL;
+       if (!iommu->qi)
+               return NULL;
+       spin_lock_irqsave(&device_domain_lock, flags);
+       list_for_each_entry(info, &domain->devices, link)
+               if (info->bus == bus && info->devfn == devfn) {
+                       found = 1;
+                       break;
+               }
+       spin_unlock_irqrestore(&device_domain_lock, flags);
+       if (!found || !info->dev)
+               return NULL;
+       if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
+               return NULL;
+       if (!dmar_find_matched_atsr_unit(info->dev))
+               return NULL;
+       info->iommu = iommu;
+       return info;
+ }
+ static void iommu_enable_dev_iotlb(struct device_domain_info *info)
+ {
+       if (!info)
+               return;
+       pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
+ }
+ static void iommu_disable_dev_iotlb(struct device_domain_info *info)
+ {
+       if (!info->dev || !pci_ats_enabled(info->dev))
+               return;
+       pci_disable_ats(info->dev);
+ }
+ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
+                                 u64 addr, unsigned mask)
+ {
+       u16 sid, qdep;
+       unsigned long flags;
+       struct device_domain_info *info;
+       spin_lock_irqsave(&device_domain_lock, flags);
+       list_for_each_entry(info, &domain->devices, link) {
+               if (!info->dev || !pci_ats_enabled(info->dev))
+                       continue;
+               sid = info->bus << 8 | info->devfn;
+               qdep = pci_ats_queue_depth(info->dev);
+               qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
+       }
+       spin_unlock_irqrestore(&device_domain_lock, flags);
+ }
+ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
+                                 unsigned long pfn, unsigned int pages, int map)
+ {
+       unsigned int mask = ilog2(__roundup_pow_of_two(pages));
+       uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
+       BUG_ON(pages == 0);
+       /*
+        * Fallback to domain selective flush if no PSI support or the size is
+        * too big.
+        * PSI requires page size to be 2 ^ x, and the base address is naturally
+        * aligned to the size
+        */
+       if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
+               iommu->flush.flush_iotlb(iommu, did, 0, 0,
+                                               DMA_TLB_DSI_FLUSH);
+       else
+               iommu->flush.flush_iotlb(iommu, did, addr, mask,
+                                               DMA_TLB_PSI_FLUSH);
+       /*
+        * In caching mode, changes of pages from non-present to present require
+        * flush. However, device IOTLB doesn't need to be flushed in this case.
+        */
+       if (!cap_caching_mode(iommu->cap) || !map)
+               iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
+ }
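
iommu_flush_iotlb_psi() rounds the page count up to a power of two and encodes it as an address mask, e.g. a 5-page flush becomes mask 3 and invalidates an aligned 8-page block. A stand-alone sketch of that calculation (the helpers below are simple stand-ins for the kernel's __roundup_pow_of_two() and ilog2()):

/* Illustrative arithmetic only: how a page count becomes the PSI mask. */
#include <stdio.h>

static unsigned int roundup_pow_of_two(unsigned int n)
{
        unsigned int r = 1;

        while (r < n)
                r <<= 1;
        return r;
}

static unsigned int ilog2(unsigned int n)
{
        unsigned int l = 0;

        while (n >>= 1)
                l++;
        return l;
}

int main(void)
{
        unsigned int pages = 5;         /* made-up request */
        unsigned int mask  = ilog2(roundup_pow_of_two(pages));

        /* mask 3 => the hardware invalidates a naturally aligned 8-page block */
        printf("pages=%u mask=%u (flushes %u pages)\n", pages, mask, 1u << mask);
        return 0;
}
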
+ static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
+ {
+       u32 pmen;
+       unsigned long flags;
+       raw_spin_lock_irqsave(&iommu->register_lock, flags);
+       pmen = readl(iommu->reg + DMAR_PMEN_REG);
+       pmen &= ~DMA_PMEN_EPM;
+       writel(pmen, iommu->reg + DMAR_PMEN_REG);
+       /* wait for the protected region status bit to clear */
+       IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
+               readl, !(pmen & DMA_PMEN_PRS), pmen);
+       raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+ }
+ static int iommu_enable_translation(struct intel_iommu *iommu)
+ {
+       u32 sts;
+       unsigned long flags;
+       raw_spin_lock_irqsave(&iommu->register_lock, flags);
+       iommu->gcmd |= DMA_GCMD_TE;
+       writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+       /* Make sure hardware complete it */
+       IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+                     readl, (sts & DMA_GSTS_TES), sts);
+       raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+       return 0;
+ }
+ static int iommu_disable_translation(struct intel_iommu *iommu)
+ {
+       u32 sts;
+       unsigned long flag;
+       raw_spin_lock_irqsave(&iommu->register_lock, flag);
+       iommu->gcmd &= ~DMA_GCMD_TE;
+       writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+       /* Make sure hardware complete it */
+       IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+                     readl, (!(sts & DMA_GSTS_TES)), sts);
+       raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+       return 0;
+ }
+ static int iommu_init_domains(struct intel_iommu *iommu)
+ {
+       unsigned long ndomains;
+       unsigned long nlongs;
+       ndomains = cap_ndoms(iommu->cap);
+       pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
+                       ndomains);
+       nlongs = BITS_TO_LONGS(ndomains);
+       spin_lock_init(&iommu->lock);
+       /* TBD: there might be 64K domains,
+        * consider other allocation for future chip
+        */
+       iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
+       if (!iommu->domain_ids) {
+               printk(KERN_ERR "Allocating domain id array failed\n");
+               return -ENOMEM;
+       }
+       iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
+                       GFP_KERNEL);
+       if (!iommu->domains) {
+               printk(KERN_ERR "Allocating domain array failed\n");
+               return -ENOMEM;
+       }
+       /*
+        * If Caching mode is set, then invalid translations are tagged
+        * with domain id 0, so we need to pre-allocate it.
+        */
+       if (cap_caching_mode(iommu->cap))
+               set_bit(0, iommu->domain_ids);
+       return 0;
+ }
+ static void domain_exit(struct dmar_domain *domain);
+ static void vm_domain_exit(struct dmar_domain *domain);
+ void free_dmar_iommu(struct intel_iommu *iommu)
+ {
+       struct dmar_domain *domain;
+       int i;
+       unsigned long flags;
+       if ((iommu->domains) && (iommu->domain_ids)) {
+               for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
+                       domain = iommu->domains[i];
+                       clear_bit(i, iommu->domain_ids);
+                       spin_lock_irqsave(&domain->iommu_lock, flags);
+                       if (--domain->iommu_count == 0) {
+                               if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
+                                       vm_domain_exit(domain);
+                               else
+                                       domain_exit(domain);
+                       }
+                       spin_unlock_irqrestore(&domain->iommu_lock, flags);
+               }
+       }
+       if (iommu->gcmd & DMA_GCMD_TE)
+               iommu_disable_translation(iommu);
+       if (iommu->irq) {
+               irq_set_handler_data(iommu->irq, NULL);
+               /* This will mask the irq */
+               free_irq(iommu->irq, iommu);
+               destroy_irq(iommu->irq);
+       }
+       kfree(iommu->domains);
+       kfree(iommu->domain_ids);
+       g_iommus[iommu->seq_id] = NULL;
+       /* if all iommus are freed, free g_iommus */
+       for (i = 0; i < g_num_of_iommus; i++) {
+               if (g_iommus[i])
+                       break;
+       }
+       if (i == g_num_of_iommus)
+               kfree(g_iommus);
+       /* free context mapping */
+       free_context_table(iommu);
+ }
+ static struct dmar_domain *alloc_domain(void)
+ {
+       struct dmar_domain *domain;
+       domain = alloc_domain_mem();
+       if (!domain)
+               return NULL;
+       domain->nid = -1;
+       memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
+       domain->flags = 0;
+       return domain;
+ }
+ static int iommu_attach_domain(struct dmar_domain *domain,
+                              struct intel_iommu *iommu)
+ {
+       int num;
+       unsigned long ndomains;
+       unsigned long flags;
+       ndomains = cap_ndoms(iommu->cap);
+       spin_lock_irqsave(&iommu->lock, flags);
+       num = find_first_zero_bit(iommu->domain_ids, ndomains);
+       if (num >= ndomains) {
+               spin_unlock_irqrestore(&iommu->lock, flags);
+               printk(KERN_ERR "IOMMU: no free domain ids\n");
+               return -ENOMEM;
+       }
+       domain->id = num;
+       set_bit(num, iommu->domain_ids);
+       set_bit(iommu->seq_id, &domain->iommu_bmp);
+       iommu->domains[num] = domain;
+       spin_unlock_irqrestore(&iommu->lock, flags);
+       return 0;
+ }
+ static void iommu_detach_domain(struct dmar_domain *domain,
+                               struct intel_iommu *iommu)
+ {
+       unsigned long flags;
+       int num, ndomains;
+       int found = 0;
+       spin_lock_irqsave(&iommu->lock, flags);
+       ndomains = cap_ndoms(iommu->cap);
+       for_each_set_bit(num, iommu->domain_ids, ndomains) {
+               if (iommu->domains[num] == domain) {
+                       found = 1;
+                       break;
+               }
+       }
+       if (found) {
+               clear_bit(num, iommu->domain_ids);
+               clear_bit(iommu->seq_id, &domain->iommu_bmp);
+               iommu->domains[num] = NULL;
+       }
+       spin_unlock_irqrestore(&iommu->lock, flags);
+ }
+ static struct iova_domain reserved_iova_list;
+ static struct lock_class_key reserved_rbtree_key;
+ static int dmar_init_reserved_ranges(void)
+ {
+       struct pci_dev *pdev = NULL;
+       struct iova *iova;
+       int i;
+       init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
+       lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
+               &reserved_rbtree_key);
+       /* IOAPIC ranges shouldn't be accessed by DMA */
+       iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
+               IOVA_PFN(IOAPIC_RANGE_END));
+       if (!iova) {
+               printk(KERN_ERR "Reserve IOAPIC range failed\n");
+               return -ENODEV;
+       }
+       /* Reserve all PCI MMIO to avoid peer-to-peer access */
+       for_each_pci_dev(pdev) {
+               struct resource *r;
+               for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+                       r = &pdev->resource[i];
+                       if (!r->flags || !(r->flags & IORESOURCE_MEM))
+                               continue;
+                       iova = reserve_iova(&reserved_iova_list,
+                                           IOVA_PFN(r->start),
+                                           IOVA_PFN(r->end));
+                       if (!iova) {
+                               printk(KERN_ERR "Reserve iova failed\n");
+                               return -ENODEV;
+                       }
+               }
+       }
+       return 0;
+ }
+ static void domain_reserve_special_ranges(struct dmar_domain *domain)
+ {
+       copy_reserved_iova(&reserved_iova_list, &domain->iovad);
+ }
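+ /*
+  * Round a guest address width up to the next width the page tables can
+  * express: the bits above the 12-bit page offset must form whole 9-bit
+  * levels.  E.g. a 40-bit guest width becomes 48 bits (four levels), while
+  * 48 bits is already aligned and is kept; the result is capped at 64.
+  */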
+ static inline int guestwidth_to_adjustwidth(int gaw)
+ {
+       int agaw;
+       int r = (gaw - 12) % 9;
+       if (r == 0)
+               agaw = gaw;
+       else
+               agaw = gaw + 9 - r;
+       if (agaw > 64)
+               agaw = 64;
+       return agaw;
+ }
+ static int domain_init(struct dmar_domain *domain, int guest_width)
+ {
+       struct intel_iommu *iommu;
+       int adjust_width, agaw;
+       unsigned long sagaw;
+       init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
+       spin_lock_init(&domain->iommu_lock);
+       domain_reserve_special_ranges(domain);
+       /* calculate AGAW */
+       iommu = domain_get_iommu(domain);
+       if (guest_width > cap_mgaw(iommu->cap))
+               guest_width = cap_mgaw(iommu->cap);
+       domain->gaw = guest_width;
+       adjust_width = guestwidth_to_adjustwidth(guest_width);
+       agaw = width_to_agaw(adjust_width);
+       sagaw = cap_sagaw(iommu->cap);
+       if (!test_bit(agaw, &sagaw)) {
+               /* hardware doesn't support it, choose a bigger one */
+               pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
+               agaw = find_next_bit(&sagaw, 5, agaw);
+               if (agaw >= 5)
+                       return -ENODEV;
+       }
+       domain->agaw = agaw;
+       INIT_LIST_HEAD(&domain->devices);
+       if (ecap_coherent(iommu->ecap))
+               domain->iommu_coherency = 1;
+       else
+               domain->iommu_coherency = 0;
+       if (ecap_sc_support(iommu->ecap))
+               domain->iommu_snooping = 1;
+       else
+               domain->iommu_snooping = 0;
+       domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
+       domain->iommu_count = 1;
+       domain->nid = iommu->node;
+       /* always allocate the top pgd */
+       domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
+       if (!domain->pgd)
+               return -ENOMEM;
+       __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
+       return 0;
+ }
+ static void domain_exit(struct dmar_domain *domain)
+ {
+       struct dmar_drhd_unit *drhd;
+       struct intel_iommu *iommu;
+       /* Domain 0 is reserved, so don't process it */
+       if (!domain)
+               return;
+       /* Flush any lazy unmaps that may reference this domain */
+       if (!intel_iommu_strict)
+               flush_unmaps_timeout(0);
+       domain_remove_dev_info(domain);
+       /* destroy iovas */
+       put_iova_domain(&domain->iovad);
+       /* clear ptes */
+       dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+       /* free page tables */
+       dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+       for_each_active_iommu(iommu, drhd)
+               if (test_bit(iommu->seq_id, &domain->iommu_bmp))
+                       iommu_detach_domain(domain, iommu);
+       free_domain_mem(domain);
+ }
+ static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
+                                u8 bus, u8 devfn, int translation)
+ {
+       struct context_entry *context;
+       unsigned long flags;
+       struct intel_iommu *iommu;
+       struct dma_pte *pgd;
+       unsigned long num;
+       unsigned long ndomains;
+       int id;
+       int agaw;
+       struct device_domain_info *info = NULL;
+       pr_debug("Set context mapping for %02x:%02x.%d\n",
+               bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+       BUG_ON(!domain->pgd);
+       BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
+              translation != CONTEXT_TT_MULTI_LEVEL);
+       iommu = device_to_iommu(segment, bus, devfn);
+       if (!iommu)
+               return -ENODEV;
+       context = device_to_context_entry(iommu, bus, devfn);
+       if (!context)
+               return -ENOMEM;
+       spin_lock_irqsave(&iommu->lock, flags);
+       if (context_present(context)) {
+               spin_unlock_irqrestore(&iommu->lock, flags);
+               return 0;
+       }
+       id = domain->id;
+       pgd = domain->pgd;
+       if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
+           domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
+               int found = 0;
+               /* find an available domain id for this device in iommu */
+               ndomains = cap_ndoms(iommu->cap);
+               for_each_set_bit(num, iommu->domain_ids, ndomains) {
+                       if (iommu->domains[num] == domain) {
+                               id = num;
+                               found = 1;
+                               break;
+                       }
+               }
+               if (found == 0) {
+                       num = find_first_zero_bit(iommu->domain_ids, ndomains);
+                       if (num >= ndomains) {
+                               spin_unlock_irqrestore(&iommu->lock, flags);
+                               printk(KERN_ERR "IOMMU: no free domain ids\n");
+                               return -EFAULT;
+                       }
+                       set_bit(num, iommu->domain_ids);
+                       iommu->domains[num] = domain;
+                       id = num;
+               }
+               /* Skip the top levels of the page tables for an
+                * iommu which has a smaller agaw than the domain's.
+                * Unnecessary in PT mode.
+                */
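+               /* Each iteration of the loop below follows the first
+                * entry of the current top-level table down one level,
+                * so the context entry ends up pointing at a table of
+                * the depth this IOMMU actually supports. */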
+               if (translation != CONTEXT_TT_PASS_THROUGH) {
+                       for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
+                               pgd = phys_to_virt(dma_pte_addr(pgd));
+                               if (!dma_pte_present(pgd)) {
+                                       spin_unlock_irqrestore(&iommu->lock, flags);
+                                       return -ENOMEM;
+                               }
+                       }
+               }
+       }
+       context_set_domain_id(context, id);
+       if (translation != CONTEXT_TT_PASS_THROUGH) {
+               info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
+               translation = info ? CONTEXT_TT_DEV_IOTLB :
+                                    CONTEXT_TT_MULTI_LEVEL;
+       }
+       /*
+        * In pass through mode, AW must be programmed to indicate the largest
+        * AGAW value supported by hardware. And ASR is ignored by hardware.
+        */
+       if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
+               context_set_address_width(context, iommu->msagaw);
+       else {
+               context_set_address_root(context, virt_to_phys(pgd));
+               context_set_address_width(context, iommu->agaw);
+       }
+       context_set_translation_type(context, translation);
+       context_set_fault_enable(context);
+       context_set_present(context);
+       domain_flush_cache(domain, context, sizeof(*context));
+       /*
+        * It's a non-present to present mapping. If hardware doesn't cache
+        * non-present entries, we only need to flush the write-buffer. If
+        * it _does_ cache non-present entries, then it does so in the
+        * special domain #0, which we have to flush:
+        */
+       if (cap_caching_mode(iommu->cap)) {
+               iommu->flush.flush_context(iommu, 0,
+                                          (((u16)bus) << 8) | devfn,
+                                          DMA_CCMD_MASK_NOBIT,
+                                          DMA_CCMD_DEVICE_INVL);
+               iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
+       } else {
+               iommu_flush_write_buffer(iommu);
+       }
+       iommu_enable_dev_iotlb(info);
+       spin_unlock_irqrestore(&iommu->lock, flags);
+       spin_lock_irqsave(&domain->iommu_lock, flags);
+       if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
+               domain->iommu_count++;
+               if (domain->iommu_count == 1)
+                       domain->nid = iommu->node;
+               domain_update_iommu_cap(domain);
+       }
+       spin_unlock_irqrestore(&domain->iommu_lock, flags);
+       return 0;
+ }
+ static int
+ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
+                       int translation)
+ {
+       int ret;
+       struct pci_dev *tmp, *parent;
+       ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
+                                        pdev->bus->number, pdev->devfn,
+                                        translation);
+       if (ret)
+               return ret;
+       /* dependent device mapping */
+       tmp = pci_find_upstream_pcie_bridge(pdev);
+       if (!tmp)
+               return 0;
+       /* Secondary interface's bus number and devfn 0 */
+       parent = pdev->bus->self;
+       while (parent != tmp) {
+               ret = domain_context_mapping_one(domain,
+                                                pci_domain_nr(parent->bus),
+                                                parent->bus->number,
+                                                parent->devfn, translation);
+               if (ret)
+                       return ret;
+               parent = parent->bus->self;
+       }
+       if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
+               return domain_context_mapping_one(domain,
+                                       pci_domain_nr(tmp->subordinate),
+                                       tmp->subordinate->number, 0,
+                                       translation);
+       else /* this is a legacy PCI bridge */
+               return domain_context_mapping_one(domain,
+                                                 pci_domain_nr(tmp->bus),
+                                                 tmp->bus->number,
+                                                 tmp->devfn,
+                                                 translation);
+ }
+ static int domain_context_mapped(struct pci_dev *pdev)
+ {
+       int ret;
+       struct pci_dev *tmp, *parent;
+       struct intel_iommu *iommu;
+       iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
+                               pdev->devfn);
+       if (!iommu)
+               return -ENODEV;
+       ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
+       if (!ret)
+               return ret;
+       /* dependent device mapping */
+       tmp = pci_find_upstream_pcie_bridge(pdev);
+       if (!tmp)
+               return ret;
+       /* Secondary interface's bus number and devfn 0 */
+       parent = pdev->bus->self;
+       while (parent != tmp) {
+               ret = device_context_mapped(iommu, parent->bus->number,
+                                           parent->devfn);
+               if (!ret)
+                       return ret;
+               parent = parent->bus->self;
+       }
+       if (pci_is_pcie(tmp))
+               return device_context_mapped(iommu, tmp->subordinate->number,
+                                            0);
+       else
+               return device_context_mapped(iommu, tmp->bus->number,
+                                            tmp->devfn);
+ }
+ /* Returns a number of VTD pages, but aligned to MM page size */
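+ /* E.g. with 4KiB MM pages, host_addr 0x1ff0 with size 0x20 touches two MM
+  * pages, so this returns 2 (two 4KiB VTD pages). */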
+ static inline unsigned long aligned_nrpages(unsigned long host_addr,
+                                           size_t size)
+ {
+       host_addr &= ~PAGE_MASK;
+       return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
+ }
+ /* Return largest possible superpage level for a given mapping */
+ static inline int hardware_largepage_caps(struct dmar_domain *domain,
+                                         unsigned long iov_pfn,
+                                         unsigned long phy_pfn,
+                                         unsigned long pages)
+ {
+       int support, level = 1;
+       unsigned long pfnmerge;
+       support = domain->iommu_superpage;
+       /* To use a large page, the virtual *and* physical addresses
+          must be aligned to 2MiB/1GiB/etc. Lower bits set in either
+          of them will mean we have to use smaller pages. So just
+          merge them and check both at once. */
+       pfnmerge = iov_pfn | phy_pfn;
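+       /* Each iteration of the loop below tests one more superpage level:
+        * level 2 is 2MiB (the low 9 PFN bits must be clear in both
+        * addresses), level 3 is 1GiB, and so on, limited by the remaining
+        * page count and by hardware support. */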
+       while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
+               pages >>= VTD_STRIDE_SHIFT;
+               if (!pages)
+                       break;
+               pfnmerge >>= VTD_STRIDE_SHIFT;
+               level++;
+               support--;
+       }
+       return level;
+ }
+ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+                           struct scatterlist *sg, unsigned long phys_pfn,
+                           unsigned long nr_pages, int prot)
+ {
+       struct dma_pte *first_pte = NULL, *pte = NULL;
+       phys_addr_t uninitialized_var(pteval);
+       int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+       unsigned long sg_res;
+       unsigned int largepage_lvl = 0;
+       unsigned long lvl_pages = 0;
+       BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
+       if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
+               return -EINVAL;
+       prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
+       if (sg)
+               sg_res = 0;
+       else {
+               sg_res = nr_pages + 1;
+               pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
+       }
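+       /* In the physically contiguous (non-sg) case sg_res starts one above
+        * nr_pages, so it never reaches zero inside the loop and the
+        * scatterlist refill branch below is never taken. */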
+       while (nr_pages > 0) {
+               uint64_t tmp;
+               if (!sg_res) {
+                       sg_res = aligned_nrpages(sg->offset, sg->length);
+                       sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
+                       sg->dma_length = sg->length;
+                       pteval = page_to_phys(sg_page(sg)) | prot;
+                       phys_pfn = pteval >> VTD_PAGE_SHIFT;
+               }
+               if (!pte) {
+                       largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
+                       first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
+                       if (!pte)
+                               return -ENOMEM;
+                       /* It is a large page */
+                       if (largepage_lvl > 1)
+                               pteval |= DMA_PTE_LARGE_PAGE;
+                       else
+                               pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
+               }
+               /* We don't need a lock here; nobody else
+                * touches this iova range.
+                */
+               tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
+               if (tmp) {
+                       static int dumps = 5;
+                       printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
+                              iov_pfn, tmp, (unsigned long long)pteval);
+                       if (dumps) {
+                               dumps--;
+                               debug_dma_dump_mappings(NULL);
+                       }
+                       WARN_ON(1);
+               }
+               lvl_pages = lvl_to_nr_pages(largepage_lvl);
+               BUG_ON(nr_pages < lvl_pages);
+               BUG_ON(sg_res < lvl_pages);
+               nr_pages -= lvl_pages;
+               iov_pfn += lvl_pages;
+               phys_pfn += lvl_pages;
+               pteval += lvl_pages * VTD_PAGE_SIZE;
+               sg_res -= lvl_pages;
+               /* If the next PTE would be the first in a new page, then we
+                  need to flush the cache on the entries we've just written.
+                  And then we'll need to recalculate 'pte', so clear it and
+                  let it get set again in the if (!pte) block above.
+                  If we're done (!nr_pages) we need to flush the cache too.
+                  Also if we've been setting superpages, we may need to
+                  recalculate 'pte' and switch back to smaller pages for the
+                  end of the mapping, if the trailing size is not enough to
+                  use another superpage (i.e. sg_res < lvl_pages). */
+               pte++;
+               if (!nr_pages || first_pte_in_page(pte) ||
+                   (largepage_lvl > 1 && sg_res < lvl_pages)) {
+                       domain_flush_cache(domain, first_pte,
+                                          (void *)pte - (void *)first_pte);
+                       pte = NULL;
+               }
+               if (!sg_res && nr_pages)
+                       sg = sg_next(sg);
+       }
+       return 0;
+ }
+ static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+                                   struct scatterlist *sg, unsigned long nr_pages,
+                                   int prot)
+ {
+       return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
+ }
+ static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
+                                    unsigned long phys_pfn, unsigned long nr_pages,
+                                    int prot)
+ {
+       return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
+ }
+ static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
+ {
+       if (!iommu)
+               return;
+       clear_context_table(iommu, bus, devfn);
+       iommu->flush.flush_context(iommu, 0, 0, 0,
+                                          DMA_CCMD_GLOBAL_INVL);
+       iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+ }
+ static void domain_remove_dev_info(struct dmar_domain *domain)
+ {
+       struct device_domain_info *info;
+       unsigned long flags;
+       struct intel_iommu *iommu;
+       spin_lock_irqsave(&device_domain_lock, flags);
+       while (!list_empty(&domain->devices)) {
+               info = list_entry(domain->devices.next,
+                       struct device_domain_info, link);
+               list_del(&info->link);
+               list_del(&info->global);
+               if (info->dev)
+                       info->dev->dev.archdata.iommu = NULL;
+               spin_unlock_irqrestore(&device_domain_lock, flags);
+               iommu_disable_dev_iotlb(info);
+               iommu = device_to_iommu(info->segment, info->bus, info->devfn);
+               iommu_detach_dev(iommu, info->bus, info->devfn);
+               free_devinfo_mem(info);
+               spin_lock_irqsave(&device_domain_lock, flags);
+       }
+       spin_unlock_irqrestore(&device_domain_lock, flags);
+ }
+ /*
+  * find_domain
+  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
+  */
+ static struct dmar_domain *
+ find_domain(struct pci_dev *pdev)
+ {
+       struct device_domain_info *info;
+       /* No lock here, assumes no domain exit in normal case */
+       info = pdev->dev.archdata.iommu;
+       if (info)
+               return info->domain;
+       return NULL;
+ }
+ /* domain is initialized */
+ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
+ {
+       struct dmar_domain *domain, *found = NULL;
+       struct intel_iommu *iommu;
+       struct dmar_drhd_unit *drhd;
+       struct device_domain_info *info, *tmp;
+       struct pci_dev *dev_tmp;
+       unsigned long flags;
+       int bus = 0, devfn = 0;
+       int segment;
+       int ret;
+       domain = find_domain(pdev);
+       if (domain)
+               return domain;
+       segment = pci_domain_nr(pdev->bus);
+       dev_tmp = pci_find_upstream_pcie_bridge(pdev);
+       if (dev_tmp) {
+               if (pci_is_pcie(dev_tmp)) {
+                       bus = dev_tmp->subordinate->number;
+                       devfn = 0;
+               } else {
+                       bus = dev_tmp->bus->number;
+                       devfn = dev_tmp->devfn;
+               }
+               spin_lock_irqsave(&device_domain_lock, flags);
+               list_for_each_entry(info, &device_domain_list, global) {
+                       if (info->segment == segment &&
+                           info->bus == bus && info->devfn == devfn) {
+                               found = info->domain;
+                               break;
+                       }
+               }
+               spin_unlock_irqrestore(&device_domain_lock, flags);
+               /* pcie-pci bridge already has a domain, use it */
+               if (found) {
+                       domain = found;
+                       goto found_domain;
+               }
+       }
+       domain = alloc_domain();
+       if (!domain)
+               goto error;
+       /* Allocate new domain for the device */
+       drhd = dmar_find_matched_drhd_unit(pdev);
+       if (!drhd) {
+               printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
+                       pci_name(pdev));
+               return NULL;
+       }
+       iommu = drhd->iommu;
+       ret = iommu_attach_domain(domain, iommu);
+       if (ret) {
+               free_domain_mem(domain);
+               goto error;
+       }
+       if (domain_init(domain, gaw)) {
+               domain_exit(domain);
+               goto error;
+       }
+       /* register pcie-to-pci device */
+       if (dev_tmp) {
+               info = alloc_devinfo_mem();
+               if (!info) {
+                       domain_exit(domain);
+                       goto error;
+               }
+               info->segment = segment;
+               info->bus = bus;
+               info->devfn = devfn;
+               info->dev = NULL;
+               info->domain = domain;
+               /* This domain is shared by devices under p2p bridge */
+               domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
+               /* pcie-to-pci bridge already has a domain, use it */
+               found = NULL;
+               spin_lock_irqsave(&device_domain_lock, flags);
+               list_for_each_entry(tmp, &device_domain_list, global) {
+                       if (tmp->segment == segment &&
+                           tmp->bus == bus && tmp->devfn == devfn) {
+                               found = tmp->domain;
+                               break;
+                       }
+               }
+               if (found) {
+                       spin_unlock_irqrestore(&device_domain_lock, flags);
+                       free_devinfo_mem(info);
+                       domain_exit(domain);
+                       domain = found;
+               } else {
+                       list_add(&info->link, &domain->devices);
+                       list_add(&info->global, &device_domain_list);
+                       spin_unlock_irqrestore(&device_domain_lock, flags);
+               }
+       }
+ found_domain:
+       info = alloc_devinfo_mem();
+       if (!info)
+               goto error;
+       info->segment = segment;
+       info->bus = pdev->bus->number;
+       info->devfn = pdev->devfn;
+       info->dev = pdev;
+       info->domain = domain;
+       spin_lock_irqsave(&device_domain_lock, flags);
+       /* somebody else was faster and already set it up */
+       found = find_domain(pdev);
+       if (found != NULL) {
+               spin_unlock_irqrestore(&device_domain_lock, flags);
+               if (found != domain) {
+                       domain_exit(domain);
+                       domain = found;
+               }
+               free_devinfo_mem(info);
+               return domain;
+       }
+       list_add(&info->link, &domain->devices);
+       list_add(&info->global, &device_domain_list);
+       pdev->dev.archdata.iommu = info;
+       spin_unlock_irqrestore(&device_domain_lock, flags);
+       return domain;
+ error:
+       /* recheck it here, maybe others set it */
+       return find_domain(pdev);
+ }
+ static int iommu_identity_mapping;
+ #define IDENTMAP_ALL          1
+ #define IDENTMAP_GFX          2
+ #define IDENTMAP_AZALIA               4
+ static int iommu_domain_identity_map(struct dmar_domain *domain,
+                                    unsigned long long start,
+                                    unsigned long long end)
+ {
+       unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
+       unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
+       if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
+                         dma_to_mm_pfn(last_vpfn))) {
+               printk(KERN_ERR "IOMMU: reserve iova failed\n");
+               return -ENOMEM;
+       }
+       pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
+                start, end, domain->id);
+       /*
+        * The RMRR range might overlap a physical memory range,
+        * so clear it first.
+        */
+       dma_pte_clear_range(domain, first_vpfn, last_vpfn);
+       return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
+                                 last_vpfn - first_vpfn + 1,
+                                 DMA_PTE_READ|DMA_PTE_WRITE);
+ }
+ static int iommu_prepare_identity_map(struct pci_dev *pdev,
+                                     unsigned long long start,
+                                     unsigned long long end)
+ {
+       struct dmar_domain *domain;
+       int ret;
+       domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+       if (!domain)
+               return -ENOMEM;
+       /* For _hardware_ passthrough, don't bother. But for software
+          passthrough, we do it anyway -- it may indicate a memory
+          range which is reserved in E820 and therefore didn't get set
+          up to start with in si_domain */
+       if (domain == si_domain && hw_pass_through) {
+               printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
+                      pci_name(pdev), start, end);
+               return 0;
+       }
+       printk(KERN_INFO
+              "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
+              pci_name(pdev), start, end);
+       
+       if (end < start) {
+               WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
+                       "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+                       dmi_get_system_info(DMI_BIOS_VENDOR),
+                       dmi_get_system_info(DMI_BIOS_VERSION),
+                    dmi_get_system_info(DMI_PRODUCT_VERSION));
+               ret = -EIO;
+               goto error;
+       }
+       if (end >> agaw_to_width(domain->agaw)) {
+               WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
+                    "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+                    agaw_to_width(domain->agaw),
+                    dmi_get_system_info(DMI_BIOS_VENDOR),
+                    dmi_get_system_info(DMI_BIOS_VERSION),
+                    dmi_get_system_info(DMI_PRODUCT_VERSION));
+               ret = -EIO;
+               goto error;
+       }
+       ret = iommu_domain_identity_map(domain, start, end);
+       if (ret)
+               goto error;
+       /* context entry init */
+       ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
+       if (ret)
+               goto error;
+       return 0;
+  error:
+       domain_exit(domain);
+       return ret;
+ }
+ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
+       struct pci_dev *pdev)
+ {
+       if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
+               return 0;
+       return iommu_prepare_identity_map(pdev, rmrr->base_address,
+               rmrr->end_address);
+ }
+ #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
+ static inline void iommu_prepare_isa(void)
+ {
+       struct pci_dev *pdev;
+       int ret;
+       pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
+       if (!pdev)
+               return;
+       printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
+       ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
+       if (ret)
+               printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
+                      "floppy might not work\n");
+ }
+ #else
+ static inline void iommu_prepare_isa(void)
+ {
+       return;
+ }
+ #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
+ static int md_domain_init(struct dmar_domain *domain, int guest_width);
 -              work_with_active_regions(nid, si_domain_work_fn, &ret);
 -              if (ret)
 -                      return ret;
+ static int __init si_domain_init(int hw)
+ {
+       struct dmar_drhd_unit *drhd;
+       struct intel_iommu *iommu;
+       int nid, ret = 0;
+       si_domain = alloc_domain();
+       if (!si_domain)
+               return -EFAULT;
+       pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
+       for_each_active_iommu(iommu, drhd) {
+               ret = iommu_attach_domain(si_domain, iommu);
+               if (ret) {
+                       domain_exit(si_domain);
+                       return -EFAULT;
+               }
+       }
+       if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+               domain_exit(si_domain);
+               return -EFAULT;
+       }
+       si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
+       if (hw)
+               return 0;
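+       /* For software passthrough, give the static identity domain a 1:1
+        * mapping of every online node's memory, walking each node's
+        * registered memory ranges via for_each_mem_pfn_range(). */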
+       for_each_online_node(nid) {
++              unsigned long start_pfn, end_pfn;
++              int i;
++
++              for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
++                      ret = iommu_domain_identity_map(si_domain,
++                                      PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
++                      if (ret)
++                              return ret;
++              }
+       }
+       return 0;
+ }
+ static void domain_remove_one_dev_info(struct dmar_domain *domain,
+                                         struct pci_dev *pdev);
+ static int identity_mapping(struct pci_dev *pdev)
+ {
+       struct device_domain_info *info;
+       if (likely(!iommu_identity_mapping))
+               return 0;
+       info = pdev->dev.archdata.iommu;
+       if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
+               return (info->domain == si_domain);
+       return 0;
+ }
+ static int domain_add_dev_info(struct dmar_domain *domain,
+                              struct pci_dev *pdev,
+                              int translation)
+ {
+       struct device_domain_info *info;
+       unsigned long flags;
+       int ret;
+       info = alloc_devinfo_mem();
+       if (!info)
+               return -ENOMEM;
+       ret = domain_context_mapping(domain, pdev, translation);
+       if (ret) {
+               free_devinfo_mem(info);
+               return ret;
+       }
+       info->segment = pci_domain_nr(pdev->bus);
+       info->bus = pdev->bus->number;
+       info->devfn = pdev->devfn;
+       info->dev = pdev;
+       info->domain = domain;
+       spin_lock_irqsave(&device_domain_lock, flags);
+       list_add(&info->link, &domain->devices);
+       list_add(&info->global, &device_domain_list);
+       pdev->dev.archdata.iommu = info;
+       spin_unlock_irqrestore(&device_domain_lock, flags);
+       return 0;
+ }
+ static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
+ {
+       if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
+               return 1;
+       if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
+               return 1;
+       if (!(iommu_identity_mapping & IDENTMAP_ALL))
+               return 0;
+       /*
+        * We want to start off with all devices in the 1:1 domain, and
+        * take them out later if we find they can't access all of memory.
+        *
+        * However, we can't do this for PCI devices behind bridges,
+        * because all PCI devices behind the same bridge will end up
+        * with the same source-id on their transactions.
+        *
+        * Practically speaking, we can't change things around for these
+        * devices at run-time, because we can't be sure there'll be no
+        * DMA transactions in flight for any of their siblings.
+        * 
+        * So PCI devices (unless they're on the root bus) as well as
+        * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
+        * the 1:1 domain, just in _case_ one of their siblings turns out
+        * not to be able to map all of memory.
+        */
+       if (!pci_is_pcie(pdev)) {
+               if (!pci_is_root_bus(pdev->bus))
+                       return 0;
+               if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
+                       return 0;
+       } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
+               return 0;
+       /* 
+        * At boot time, we don't yet know if devices will be 64-bit capable.
+        * Assume that they will -- if they turn out not to be, then we can 
+        * take them out of the 1:1 domain later.
+        */
+       if (!startup) {
+               /*
+                * If the device's dma_mask is less than the system's memory
+                * size then this is not a candidate for identity mapping.
+                */
+               u64 dma_mask = pdev->dma_mask;
+               if (pdev->dev.coherent_dma_mask &&
+                   pdev->dev.coherent_dma_mask < dma_mask)
+                       dma_mask = pdev->dev.coherent_dma_mask;
+               return dma_mask >= dma_get_required_mask(&pdev->dev);
+       }
+       return 1;
+ }
+ static int __init iommu_prepare_static_identity_mapping(int hw)
+ {
+       struct pci_dev *pdev = NULL;
+       int ret;
+       ret = si_domain_init(hw);
+       if (ret)
+               return -EFAULT;
+       for_each_pci_dev(pdev) {
+               /* Skip Host/PCI Bridge devices */
+               if (IS_BRIDGE_HOST_DEVICE(pdev))
+                       continue;
+               if (iommu_should_identity_map(pdev, 1)) {
+                       printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
+                              hw ? "hardware" : "software", pci_name(pdev));
+                       ret = domain_add_dev_info(si_domain, pdev,
+                                                    hw ? CONTEXT_TT_PASS_THROUGH :
+                                                    CONTEXT_TT_MULTI_LEVEL);
+                       if (ret)
+                               return ret;
+               }
+       }
+       return 0;
+ }
+ static int __init init_dmars(void)
+ {
+       struct dmar_drhd_unit *drhd;
+       struct dmar_rmrr_unit *rmrr;
+       struct pci_dev *pdev;
+       struct intel_iommu *iommu;
+       int i, ret;
+       /*
+        * for each drhd
+        *    allocate root
+        *    initialize and program root entry to not present
+        * endfor
+        */
+       for_each_drhd_unit(drhd) {
+               g_num_of_iommus++;
+               /*
+                * lock not needed as this is only incremented in the
+                * single-threaded kernel __init code path; all other
+                * accesses are read-only
+                */
+       }
+       g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
+                       GFP_KERNEL);
+       if (!g_iommus) {
+               printk(KERN_ERR "Allocating global iommu array failed\n");
+               ret = -ENOMEM;
+               goto error;
+       }
+       deferred_flush = kzalloc(g_num_of_iommus *
+               sizeof(struct deferred_flush_tables), GFP_KERNEL);
+       if (!deferred_flush) {
+               ret = -ENOMEM;
+               goto error;
+       }
+       for_each_drhd_unit(drhd) {
+               if (drhd->ignored)
+                       continue;
+               iommu = drhd->iommu;
+               g_iommus[iommu->seq_id] = iommu;
+               ret = iommu_init_domains(iommu);
+               if (ret)
+                       goto error;
+               /*
+                * TBD:
+                * we could share the same root & context tables
+                * among all IOMMUs; need to split it later.
+                */
+               ret = iommu_alloc_root_entry(iommu);
+               if (ret) {
+                       printk(KERN_ERR "IOMMU: allocate root entry failed\n");
+                       goto error;
+               }
+               if (!ecap_pass_through(iommu->ecap))
+                       hw_pass_through = 0;
+       }
+       /*
+        * Start from a sane iommu hardware state.
+        */
+       for_each_drhd_unit(drhd) {
+               if (drhd->ignored)
+                       continue;
+               iommu = drhd->iommu;
+               /*
+                * If the queued invalidation is already initialized by us
+                * (for example, while enabling interrupt-remapping) then
+                * we already have things rolling from a sane state.
+                */
+               if (iommu->qi)
+                       continue;
+               /*
+                * Clear any previous faults.
+                */
+               dmar_fault(-1, iommu);
+               /*
+                * Disable queued invalidation if supported and already enabled
+                * before OS handover.
+                */
+               dmar_disable_qi(iommu);
+       }
+       for_each_drhd_unit(drhd) {
+               if (drhd->ignored)
+                       continue;
+               iommu = drhd->iommu;
+               if (dmar_enable_qi(iommu)) {
+                       /*
+                        * Queued Invalidation is not enabled, so use Register
+                        * Based Invalidation
+                        */
+                       iommu->flush.flush_context = __iommu_flush_context;
+                       iommu->flush.flush_iotlb = __iommu_flush_iotlb;
+                       printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
+                              "invalidation\n",
+                               iommu->seq_id,
+                              (unsigned long long)drhd->reg_base_addr);
+               } else {
+                       iommu->flush.flush_context = qi_flush_context;
+                       iommu->flush.flush_iotlb = qi_flush_iotlb;
+                       printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
+                              "invalidation\n",
+                               iommu->seq_id,
+                              (unsigned long long)drhd->reg_base_addr);
+               }
+       }
+       if (iommu_pass_through)
+               iommu_identity_mapping |= IDENTMAP_ALL;
+ #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
+       iommu_identity_mapping |= IDENTMAP_GFX;
+ #endif
+       check_tylersburg_isoch();
+       /*
+        * If pass through is not set or not enabled, set up context entries
+        * for the identity mappings for rmrr, gfx and isa, possibly falling
+        * back to the static identity mapping if iommu_identity_mapping is set.
+        */
+       if (iommu_identity_mapping) {
+               ret = iommu_prepare_static_identity_mapping(hw_pass_through);
+               if (ret) {
+                       printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
+                       goto error;
+               }
+       }
+       /*
+        * For each rmrr
+        *   for each dev attached to rmrr
+        *   do
+        *     locate drhd for dev, alloc domain for dev
+        *     allocate free domain
+        *     allocate page table entries for rmrr
+        *     if context not allocated for bus
+        *           allocate and init context
+        *           set present in root table for this bus
+        *     init context with domain, translation etc
+        *    endfor
+        * endfor
+        */
+       printk(KERN_INFO "IOMMU: Setting RMRR:\n");
+       for_each_rmrr_units(rmrr) {
+               for (i = 0; i < rmrr->devices_cnt; i++) {
+                       pdev = rmrr->devices[i];
+                       /*
+                        * some BIOSes list non-existent devices in the
+                        * DMAR table.
+                        */
+                       if (!pdev)
+                               continue;
+                       ret = iommu_prepare_rmrr_dev(rmrr, pdev);
+                       if (ret)
+                               printk(KERN_ERR
+                                      "IOMMU: mapping reserved region failed\n");
+               }
+       }
+       iommu_prepare_isa();
+       /*
+        * for each drhd
+        *   enable fault log
+        *   global invalidate context cache
+        *   global invalidate iotlb
+        *   enable translation
+        */
+       for_each_drhd_unit(drhd) {
+               if (drhd->ignored) {
+                       /*
+                        * we always have to disable PMRs or DMA may fail on
+                        * this device
+                        */
+                       if (force_on)
+                               iommu_disable_protect_mem_regions(drhd->iommu);
+                       continue;
+               }
+               iommu = drhd->iommu;
+               iommu_flush_write_buffer(iommu);
+               ret = dmar_set_interrupt(iommu);
+               if (ret)
+                       goto error;
+               iommu_set_root_entry(iommu);
+               iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
+               iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+               ret = iommu_enable_translation(iommu);
+               if (ret)
+                       goto error;
+               iommu_disable_protect_mem_regions(iommu);
+       }
+       return 0;
+ error:
+       for_each_drhd_unit(drhd) {
+               if (drhd->ignored)
+                       continue;
+               iommu = drhd->iommu;
+               free_iommu(iommu);
+       }
+       kfree(g_iommus);
+       return ret;
+ }
+ /* This takes a number of _MM_ pages, not VTD pages */
+ static struct iova *intel_alloc_iova(struct device *dev,
+                                    struct dmar_domain *domain,
+                                    unsigned long nrpages, uint64_t dma_mask)
+ {
+       struct pci_dev *pdev = to_pci_dev(dev);
+       struct iova *iova = NULL;
+       /* Restrict dma_mask to the width that the iommu can handle */
+       dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
+       if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
+               /*
+                * First try to allocate an I/O virtual address within
+                * DMA_BIT_MASK(32); if that fails, then try allocating
+                * from the higher range.
+                */
+               iova = alloc_iova(&domain->iovad, nrpages,
+                                 IOVA_PFN(DMA_BIT_MASK(32)), 1);
+               if (iova)
+                       return iova;
+       }
+       iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
+       if (unlikely(!iova)) {
+               printk(KERN_ERR "Allocating %ld-page iova for %s failed",
+                      nrpages, pci_name(pdev));
+               return NULL;
+       }
+       return iova;
+ }
+ static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
+ {
+       struct dmar_domain *domain;
+       int ret;
+       domain = get_domain_for_dev(pdev,
+                       DEFAULT_DOMAIN_ADDRESS_WIDTH);
+       if (!domain) {
+               printk(KERN_ERR
+                       "Allocating domain for %s failed", pci_name(pdev));
+               return NULL;
+       }
+       /* make sure context mapping is ok */
+       if (unlikely(!domain_context_mapped(pdev))) {
+               ret = domain_context_mapping(domain, pdev,
+                                            CONTEXT_TT_MULTI_LEVEL);
+               if (ret) {
+                       printk(KERN_ERR
+                               "Domain context map for %s failed",
+                               pci_name(pdev));
+                       return NULL;
+               }
+       }
+       return domain;
+ }
+ static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
+ {
+       struct device_domain_info *info;
+       /* No lock here, assumes no domain exit in normal case */
+       info = dev->dev.archdata.iommu;
+       if (likely(info))
+               return info->domain;
+       return __get_valid_domain_for_dev(dev);
+ }
+ static int iommu_dummy(struct pci_dev *pdev)
+ {
+       return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
+ }
+ /* Check whether the pdev needs to go through the non-identity map/unmap process. */
+ static int iommu_no_mapping(struct device *dev)
+ {
+       struct pci_dev *pdev;
+       int found;
+       if (unlikely(dev->bus != &pci_bus_type))
+               return 1;
+       pdev = to_pci_dev(dev);
+       if (iommu_dummy(pdev))
+               return 1;
+       if (!iommu_identity_mapping)
+               return 0;
+       found = identity_mapping(pdev);
+       if (found) {
+               if (iommu_should_identity_map(pdev, 0))
+                       return 1;
+               else {
+                       /*
+                        * A 32-bit DMA device is removed from si_domain and
+                        * falls back to non-identity mapping.
+                        */
+                       domain_remove_one_dev_info(si_domain, pdev);
+                       printk(KERN_INFO "32bit %s uses non-identity mapping\n",
+                              pci_name(pdev));
+                       return 0;
+               }
+       } else {
+               /*
+                * When a 64-bit DMA device is detached from a VM, the device
+                * is put into si_domain for identity mapping.
+                */
+               if (iommu_should_identity_map(pdev, 0)) {
+                       int ret;
+                       ret = domain_add_dev_info(si_domain, pdev,
+                                                 hw_pass_through ?
+                                                 CONTEXT_TT_PASS_THROUGH :
+                                                 CONTEXT_TT_MULTI_LEVEL);
+                       if (!ret) {
+                               printk(KERN_INFO "64bit %s uses identity mapping\n",
+                                      pci_name(pdev));
+                               return 1;
+                       }
+               }
+       }
+       return 0;
+ }
+ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
+                                    size_t size, int dir, u64 dma_mask)
+ {
+       struct pci_dev *pdev = to_pci_dev(hwdev);
+       struct dmar_domain *domain;
+       phys_addr_t start_paddr;
+       struct iova *iova;
+       int prot = 0;
+       int ret;
+       struct intel_iommu *iommu;
+       unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
+       BUG_ON(dir == DMA_NONE);
+       if (iommu_no_mapping(hwdev))
+               return paddr;
+       domain = get_valid_domain_for_dev(pdev);
+       if (!domain)
+               return 0;
+       iommu = domain_get_iommu(domain);
+       size = aligned_nrpages(paddr, size);
+       iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
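+       /* "size" is now a count of 4KiB VTD pages; the iova allocator works
+        * in MM pages, hence the dma_to_mm_pfn() conversion above. */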
+       if (!iova)
+               goto error;
+       /*
+        * Check if DMAR supports zero-length reads on write only
+        * mappings.
+        */
+       if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+                       !cap_zlr(iommu->cap))
+               prot |= DMA_PTE_READ;
+       if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+               prot |= DMA_PTE_WRITE;
+       /*
+        * The range paddr .. paddr + size might cover only part of a page,
+        * so we map the whole page.  Note: if two parts of one page are
+        * mapped separately, we might have two guest addresses mapping to
+        * the same host paddr, but this is not a big problem.
+        */
+       ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
+                                mm_to_dma_pfn(paddr_pfn), size, prot);
+       if (ret)
+               goto error;
+       /* it's a non-present to present mapping. Only flush if caching mode */
+       if (cap_caching_mode(iommu->cap))
+               iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
+       else
+               iommu_flush_write_buffer(iommu);
+       start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
+       start_paddr += paddr & ~PAGE_MASK;
+       return start_paddr;
+ error:
+       if (iova)
+               __free_iova(&domain->iovad, iova);
+       printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
+               pci_name(pdev), size, (unsigned long long)paddr, dir);
+       return 0;
+ }
+ static dma_addr_t intel_map_page(struct device *dev, struct page *page,
+                                unsigned long offset, size_t size,
+                                enum dma_data_direction dir,
+                                struct dma_attrs *attrs)
+ {
+       return __intel_map_single(dev, page_to_phys(page) + offset, size,
+                                 dir, to_pci_dev(dev)->dma_mask);
+ }
+ static void flush_unmaps(void)
+ {
+       int i, j;
+       timer_on = 0;
+       /* just flush them all */
+       for (i = 0; i < g_num_of_iommus; i++) {
+               struct intel_iommu *iommu = g_iommus[i];
+               if (!iommu)
+                       continue;
+               if (!deferred_flush[i].next)
+                       continue;
+               /* In caching mode, global flushes make emulation expensive */
+               if (!cap_caching_mode(iommu->cap))
+                       iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+                                        DMA_TLB_GLOBAL_FLUSH);
+               for (j = 0; j < deferred_flush[i].next; j++) {
+                       unsigned long mask;
+                       struct iova *iova = deferred_flush[i].iova[j];
+                       struct dmar_domain *domain = deferred_flush[i].domain[j];
+                       /* On real hardware multiple invalidations are expensive */
+                       if (cap_caching_mode(iommu->cap))
+                               iommu_flush_iotlb_psi(iommu, domain->id,
+                               iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
+                       else {
+                               mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
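+                               /* mask is the order (log2 of the number of
+                                * 4KiB pages) covered by this IOVA, as
+                                * expected by the device-IOTLB invalidate. */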
+                               iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
+                                               (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
+                       }
+                       __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
+               }
+               deferred_flush[i].next = 0;
+       }
+       list_size = 0;
+ }
+ static void flush_unmaps_timeout(unsigned long data)
+ {
+       unsigned long flags;
+       spin_lock_irqsave(&async_umap_flush_lock, flags);
+       flush_unmaps();
+       spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+ }
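+ /*
+  * Unmapped IOVAs are queued per IOMMU and released in batches, either when
+  * the 10ms unmap_timer fires or once HIGH_WATER_MARK entries have
+  * accumulated, so a single IOTLB flush can cover many unmaps.
+  */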
+ static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+ {
+       unsigned long flags;
+       int next, iommu_id;
+       struct intel_iommu *iommu;
+       spin_lock_irqsave(&async_umap_flush_lock, flags);
+       if (list_size == HIGH_WATER_MARK)
+               flush_unmaps();
+       iommu = domain_get_iommu(dom);
+       iommu_id = iommu->seq_id;
+       next = deferred_flush[iommu_id].next;
+       deferred_flush[iommu_id].domain[next] = dom;
+       deferred_flush[iommu_id].iova[next] = iova;
+       deferred_flush[iommu_id].next++;
+       if (!timer_on) {
+               mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
+               timer_on = 1;
+       }
+       list_size++;
+       spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+ }
+ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
+                            size_t size, enum dma_data_direction dir,
+                            struct dma_attrs *attrs)
+ {
+       struct pci_dev *pdev = to_pci_dev(dev);
+       struct dmar_domain *domain;
+       unsigned long start_pfn, last_pfn;
+       struct iova *iova;
+       struct intel_iommu *iommu;
+       if (iommu_no_mapping(dev))
+               return;
+       domain = find_domain(pdev);
+       BUG_ON(!domain);
+       iommu = domain_get_iommu(domain);
+       iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
+       if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
+                     (unsigned long long)dev_addr))
+               return;
+       start_pfn = mm_to_dma_pfn(iova->pfn_lo);
+       last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
+       pr_debug("Device %s unmapping: pfn %lx-%lx\n",
+                pci_name(pdev), start_pfn, last_pfn);
+       /*  clear the whole page */
+       dma_pte_clear_range(domain, start_pfn, last_pfn);
+       /* free page tables */
+       dma_pte_free_pagetable(domain, start_pfn, last_pfn);
+       if (intel_iommu_strict) {
+               iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
+                                     last_pfn - start_pfn + 1, 0);
+               /* free iova */
+               __free_iova(&domain->iovad, iova);
+       } else {
+               add_unmap(domain, iova);
+               /*
+                * Queue up the release of the mapping to avoid the ~1/6th of
+                * CPU time otherwise consumed by a synchronous IOTLB flush.
+                */
+       }
+ }
+ static void *intel_alloc_coherent(struct device *hwdev, size_t size,
+                                 dma_addr_t *dma_handle, gfp_t flags)
+ {
+       void *vaddr;
+       int order;
+       size = PAGE_ALIGN(size);
+       order = get_order(size);
+       if (!iommu_no_mapping(hwdev))
+               flags &= ~(GFP_DMA | GFP_DMA32);
+       else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
+               if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
+                       flags |= GFP_DMA;
+               else
+                       flags |= GFP_DMA32;
+       }
+       vaddr = (void *)__get_free_pages(flags, order);
+       if (!vaddr)
+               return NULL;
+       memset(vaddr, 0, size);
+       *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
+                                        DMA_BIDIRECTIONAL,
+                                        hwdev->coherent_dma_mask);
+       if (*dma_handle)
+               return vaddr;
+       free_pages((unsigned long)vaddr, order);
+       return NULL;
+ }
+ static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+                               dma_addr_t dma_handle)
+ {
+       int order;
+       size = PAGE_ALIGN(size);
+       order = get_order(size);
+       intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
+       free_pages((unsigned long)vaddr, order);
+ }
+ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
+                          int nelems, enum dma_data_direction dir,
+                          struct dma_attrs *attrs)
+ {
+       struct pci_dev *pdev = to_pci_dev(hwdev);
+       struct dmar_domain *domain;
+       unsigned long start_pfn, last_pfn;
+       struct iova *iova;
+       struct intel_iommu *iommu;
+       if (iommu_no_mapping(hwdev))
+               return;
+       domain = find_domain(pdev);
+       BUG_ON(!domain);
+       iommu = domain_get_iommu(domain);
+       iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
+       if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
+                     (unsigned long long)sglist[0].dma_address))
+               return;
+       start_pfn = mm_to_dma_pfn(iova->pfn_lo);
+       last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
+       /*  clear the whole page */
+       dma_pte_clear_range(domain, start_pfn, last_pfn);
+       /* free page tables */
+       dma_pte_free_pagetable(domain, start_pfn, last_pfn);
+       if (intel_iommu_strict) {
+               iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
+                                     last_pfn - start_pfn + 1, 0);
+               /* free iova */
+               __free_iova(&domain->iovad, iova);
+       } else {
+               add_unmap(domain, iova);
+               /*
+                * Queue up the release of the mapping to avoid the ~1/6th of
+                * CPU time otherwise consumed by a synchronous IOTLB flush.
+                */
+       }
+ }
+ static int intel_nontranslate_map_sg(struct device *hwdev,
+       struct scatterlist *sglist, int nelems, int dir)
+ {
+       int i;
+       struct scatterlist *sg;
+       for_each_sg(sglist, sg, nelems, i) {
+               BUG_ON(!sg_page(sg));
+               sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
+               sg->dma_length = sg->length;
+       }
+       return nelems;
+ }
+ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
+                       enum dma_data_direction dir, struct dma_attrs *attrs)
+ {
+       int i;
+       struct pci_dev *pdev = to_pci_dev(hwdev);
+       struct dmar_domain *domain;
+       size_t size = 0;
+       int prot = 0;
+       struct iova *iova = NULL;
+       int ret;
+       struct scatterlist *sg;
+       unsigned long start_vpfn;
+       struct intel_iommu *iommu;
+       BUG_ON(dir == DMA_NONE);
+       if (iommu_no_mapping(hwdev))
+               return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
+       domain = get_valid_domain_for_dev(pdev);
+       if (!domain)
+               return 0;
+       iommu = domain_get_iommu(domain);
+       for_each_sg(sglist, sg, nelems, i)
+               size += aligned_nrpages(sg->offset, sg->length);
+       iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
+                               pdev->dma_mask);
+       if (!iova) {
+               sglist->dma_length = 0;
+               return 0;
+       }
+       /*
+        * Check if DMAR supports zero-length reads on write only
+        * mappings..
+        */
+       if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+                       !cap_zlr(iommu->cap))
+               prot |= DMA_PTE_READ;
+       if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+               prot |= DMA_PTE_WRITE;
+       start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
+       ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
+       if (unlikely(ret)) {
+               /*  clear the page */
+               dma_pte_clear_range(domain, start_vpfn,
+                                   start_vpfn + size - 1);
+               /* free page tables */
+               dma_pte_free_pagetable(domain, start_vpfn,
+                                      start_vpfn + size - 1);
+               /* free iova */
+               __free_iova(&domain->iovad, iova);
+               return 0;
+       }
+       /* It's a non-present to present mapping. Only flush if caching mode is enabled. */
+       if (cap_caching_mode(iommu->cap))
+               iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
+       else
+               iommu_flush_write_buffer(iommu);
+       return nelems;
+ }
+ static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
+ {
+       return !dma_addr;
+ }
+ struct dma_map_ops intel_dma_ops = {
+       .alloc_coherent = intel_alloc_coherent,
+       .free_coherent = intel_free_coherent,
+       .map_sg = intel_map_sg,
+       .unmap_sg = intel_unmap_sg,
+       .map_page = intel_map_page,
+       .unmap_page = intel_unmap_page,
+       .mapping_error = intel_mapping_error,
+ };
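For orientation: drivers never call these hooks directly. Once intel_iommu_init() below installs intel_dma_ops as the global dma_ops, they are reached through the generic DMA API. A minimal sketch of that path, in which "pdev", "buf" and "len" are hypothetical driver state rather than anything defined in this patch:

#include <linux/dma-mapping.h>
#include <linux/pci.h>

/* Illustrative only: a driver maps a buffer for device reads and unmaps it. */
static int example_dma_map(struct pci_dev *pdev, void *buf, size_t len)
{
        dma_addr_t handle;

        /* dma_map_single() dispatches to intel_map_page() via dma_ops->map_page. */
        handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
        if (dma_mapping_error(&pdev->dev, handle))      /* intel_mapping_error() */
                return -ENOMEM;

        /* ... program the device with "handle" and wait for the DMA to finish ... */

        /* Ends up in intel_unmap_page(), often via the deferred-flush path above. */
        dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
        return 0;
}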
+ static inline int iommu_domain_cache_init(void)
+ {
+       int ret = 0;
+       iommu_domain_cache = kmem_cache_create("iommu_domain",
+                                        sizeof(struct dmar_domain),
+                                        0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        NULL);
+       if (!iommu_domain_cache) {
+               printk(KERN_ERR "Couldn't create iommu_domain cache\n");
+               ret = -ENOMEM;
+       }
+       return ret;
+ }
+ static inline int iommu_devinfo_cache_init(void)
+ {
+       int ret = 0;
+       iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
+                                        sizeof(struct device_domain_info),
+                                        0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        NULL);
+       if (!iommu_devinfo_cache) {
+               printk(KERN_ERR "Couldn't create devinfo cache\n");
+               ret = -ENOMEM;
+       }
+       return ret;
+ }
+ static inline int iommu_iova_cache_init(void)
+ {
+       int ret = 0;
+       iommu_iova_cache = kmem_cache_create("iommu_iova",
+                                        sizeof(struct iova),
+                                        0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        NULL);
+       if (!iommu_iova_cache) {
+               printk(KERN_ERR "Couldn't create iova cache\n");
+               ret = -ENOMEM;
+       }
+       return ret;
+ }
+ static int __init iommu_init_mempool(void)
+ {
+       int ret;
+       ret = iommu_iova_cache_init();
+       if (ret)
+               return ret;
+       ret = iommu_domain_cache_init();
+       if (ret)
+               goto domain_error;
+       ret = iommu_devinfo_cache_init();
+       if (!ret)
+               return ret;
+       kmem_cache_destroy(iommu_domain_cache);
+ domain_error:
+       kmem_cache_destroy(iommu_iova_cache);
+       return -ENOMEM;
+ }
+ static void __init iommu_exit_mempool(void)
+ {
+       kmem_cache_destroy(iommu_devinfo_cache);
+       kmem_cache_destroy(iommu_domain_cache);
+       kmem_cache_destroy(iommu_iova_cache);
+ }
+ static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
+ {
+       struct dmar_drhd_unit *drhd;
+       u32 vtbar;
+       int rc;
+       /* We know that this device on this chipset has its own IOMMU.
+        * If we find it under a different IOMMU, then the BIOS is lying
+        * to us. Hope that the IOMMU for this device is actually
+        * disabled, and it needs no translation...
+        */
+       rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
+       if (rc) {
+               /* "can't" happen */
+               dev_info(&pdev->dev, "failed to run vt-d quirk\n");
+               return;
+       }
+       vtbar &= 0xffff0000;
+       /* we know that this IOMMU should be at offset 0xa000 from vtbar */
+       drhd = dmar_find_matched_drhd_unit(pdev);
+       if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
+                           TAINT_FIRMWARE_WORKAROUND,
+                           "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
+               pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+ }
+ DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
+ static void __init init_no_remapping_devices(void)
+ {
+       struct dmar_drhd_unit *drhd;
+       for_each_drhd_unit(drhd) {
+               if (!drhd->include_all) {
+                       int i;
+                       for (i = 0; i < drhd->devices_cnt; i++)
+                               if (drhd->devices[i] != NULL)
+                                       break;
+                       /* ignore DMAR unit if no pci devices exist */
+                       if (i == drhd->devices_cnt)
+                               drhd->ignored = 1;
+               }
+       }
+       for_each_drhd_unit(drhd) {
+               int i;
+               if (drhd->ignored || drhd->include_all)
+                       continue;
+               for (i = 0; i < drhd->devices_cnt; i++)
+                       if (drhd->devices[i] &&
+                           !IS_GFX_DEVICE(drhd->devices[i]))
+                               break;
+               if (i < drhd->devices_cnt)
+                       continue;
+               /* This IOMMU has *only* gfx devices. Either bypass it or
+                  set the gfx_mapped flag, as appropriate */
+               if (dmar_map_gfx) {
+                       intel_iommu_gfx_mapped = 1;
+               } else {
+                       drhd->ignored = 1;
+                       for (i = 0; i < drhd->devices_cnt; i++) {
+                               if (!drhd->devices[i])
+                                       continue;
+                               drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+                       }
+               }
+       }
+ }
+ #ifdef CONFIG_SUSPEND
+ static int init_iommu_hw(void)
+ {
+       struct dmar_drhd_unit *drhd;
+       struct intel_iommu *iommu = NULL;
+       for_each_active_iommu(iommu, drhd)
+               if (iommu->qi)
+                       dmar_reenable_qi(iommu);
+       for_each_iommu(iommu, drhd) {
+               if (drhd->ignored) {
+                       /*
+                        * we always have to disable PMRs or DMA may fail on
+                        * this device
+                        */
+                       if (force_on)
+                               iommu_disable_protect_mem_regions(iommu);
+                       continue;
+               }
+
+               iommu_flush_write_buffer(iommu);
+               iommu_set_root_entry(iommu);
+               iommu->flush.flush_context(iommu, 0, 0, 0,
+                                          DMA_CCMD_GLOBAL_INVL);
+               iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+                                        DMA_TLB_GLOBAL_FLUSH);
+               if (iommu_enable_translation(iommu))
+                       return 1;
+               iommu_disable_protect_mem_regions(iommu);
+       }
+       return 0;
+ }
+ static void iommu_flush_all(void)
+ {
+       struct dmar_drhd_unit *drhd;
+       struct intel_iommu *iommu;
+       for_each_active_iommu(iommu, drhd) {
+               iommu->flush.flush_context(iommu, 0, 0, 0,
+                                          DMA_CCMD_GLOBAL_INVL);
+               iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+                                        DMA_TLB_GLOBAL_FLUSH);
+       }
+ }
+ static int iommu_suspend(void)
+ {
+       struct dmar_drhd_unit *drhd;
+       struct intel_iommu *iommu = NULL;
+       unsigned long flag;
+       for_each_active_iommu(iommu, drhd) {
+               iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
+                                                GFP_ATOMIC);
+               if (!iommu->iommu_state)
+                       goto nomem;
+       }
+       iommu_flush_all();
+       for_each_active_iommu(iommu, drhd) {
+               iommu_disable_translation(iommu);
+               raw_spin_lock_irqsave(&iommu->register_lock, flag);
+               iommu->iommu_state[SR_DMAR_FECTL_REG] =
+                       readl(iommu->reg + DMAR_FECTL_REG);
+               iommu->iommu_state[SR_DMAR_FEDATA_REG] =
+                       readl(iommu->reg + DMAR_FEDATA_REG);
+               iommu->iommu_state[SR_DMAR_FEADDR_REG] =
+                       readl(iommu->reg + DMAR_FEADDR_REG);
+               iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
+                       readl(iommu->reg + DMAR_FEUADDR_REG);
+               raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+       }
+       return 0;
+ nomem:
+       for_each_active_iommu(iommu, drhd)
+               kfree(iommu->iommu_state);
+       return -ENOMEM;
+ }
+ static void iommu_resume(void)
+ {
+       struct dmar_drhd_unit *drhd;
+       struct intel_iommu *iommu = NULL;
+       unsigned long flag;
+       if (init_iommu_hw()) {
+               if (force_on)
+                       panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
+               else
+                       WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
+               return;
+       }
+       for_each_active_iommu(iommu, drhd) {
+               raw_spin_lock_irqsave(&iommu->register_lock, flag);
+               writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
+                       iommu->reg + DMAR_FECTL_REG);
+               writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
+                       iommu->reg + DMAR_FEDATA_REG);
+               writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
+                       iommu->reg + DMAR_FEADDR_REG);
+               writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
+                       iommu->reg + DMAR_FEUADDR_REG);
+               raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+       }
+       for_each_active_iommu(iommu, drhd)
+               kfree(iommu->iommu_state);
+ }
+ static struct syscore_ops iommu_syscore_ops = {
+       .resume         = iommu_resume,
+       .suspend        = iommu_suspend,
+ };
+ static void __init init_iommu_pm_ops(void)
+ {
+       register_syscore_ops(&iommu_syscore_ops);
+ }
+ #else
+ static inline void init_iommu_pm_ops(void) {}
+ #endif        /* CONFIG_SUSPEND */
+ LIST_HEAD(dmar_rmrr_units);
+ static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
+ {
+       list_add(&rmrr->list, &dmar_rmrr_units);
+ }
+ int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
+ {
+       struct acpi_dmar_reserved_memory *rmrr;
+       struct dmar_rmrr_unit *rmrru;
+       rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
+       if (!rmrru)
+               return -ENOMEM;
+       rmrru->hdr = header;
+       rmrr = (struct acpi_dmar_reserved_memory *)header;
+       rmrru->base_address = rmrr->base_address;
+       rmrru->end_address = rmrr->end_address;
+       dmar_register_rmrr_unit(rmrru);
+       return 0;
+ }
+ static int __init
+ rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
+ {
+       struct acpi_dmar_reserved_memory *rmrr;
+       int ret;
+       rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
+       ret = dmar_parse_dev_scope((void *)(rmrr + 1),
+               ((void *)rmrr) + rmrr->header.length,
+               &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
+       if (ret || (rmrru->devices_cnt == 0)) {
+               list_del(&rmrru->list);
+               kfree(rmrru);
+       }
+       return ret;
+ }
+ static LIST_HEAD(dmar_atsr_units);
+ int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
+ {
+       struct acpi_dmar_atsr *atsr;
+       struct dmar_atsr_unit *atsru;
+       atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+       atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
+       if (!atsru)
+               return -ENOMEM;
+       atsru->hdr = hdr;
+       atsru->include_all = atsr->flags & 0x1;
+       list_add(&atsru->list, &dmar_atsr_units);
+       return 0;
+ }
+ static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
+ {
+       int rc;
+       struct acpi_dmar_atsr *atsr;
+       if (atsru->include_all)
+               return 0;
+       atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+       rc = dmar_parse_dev_scope((void *)(atsr + 1),
+                               (void *)atsr + atsr->header.length,
+                               &atsru->devices_cnt, &atsru->devices,
+                               atsr->segment);
+       if (rc || !atsru->devices_cnt) {
+               list_del(&atsru->list);
+               kfree(atsru);
+       }
+       return rc;
+ }
+ int dmar_find_matched_atsr_unit(struct pci_dev *dev)
+ {
+       int i;
+       struct pci_bus *bus;
+       struct acpi_dmar_atsr *atsr;
+       struct dmar_atsr_unit *atsru;
+       dev = pci_physfn(dev);
+       list_for_each_entry(atsru, &dmar_atsr_units, list) {
+               atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+               if (atsr->segment == pci_domain_nr(dev->bus))
+                       goto found;
+       }
+       return 0;
+ found:
+       for (bus = dev->bus; bus; bus = bus->parent) {
+               struct pci_dev *bridge = bus->self;
+               if (!bridge || !pci_is_pcie(bridge) ||
+                   bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
+                       return 0;
+               if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
+                       for (i = 0; i < atsru->devices_cnt; i++)
+                               if (atsru->devices[i] == bridge)
+                                       return 1;
+                       break;
+               }
+       }
+       if (atsru->include_all)
+               return 1;
+       return 0;
+ }
+ int dmar_parse_rmrr_atsr_dev(void)
+ {
+       struct dmar_rmrr_unit *rmrr, *rmrr_n;
+       struct dmar_atsr_unit *atsr, *atsr_n;
+       int ret = 0;
+       list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
+               ret = rmrr_parse_dev(rmrr);
+               if (ret)
+                       return ret;
+       }
+       list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
+               ret = atsr_parse_dev(atsr);
+               if (ret)
+                       return ret;
+       }
+       return ret;
+ }
+ /*
+  * Here we only respond to the action of a device being unbound from its
+  * driver.
+  *
+  * A newly added device is not attached to its DMAR domain here yet; that
+  * happens when the device is first mapped to an iova.
+  */
+ static int device_notifier(struct notifier_block *nb,
+                                 unsigned long action, void *data)
+ {
+       struct device *dev = data;
+       struct pci_dev *pdev = to_pci_dev(dev);
+       struct dmar_domain *domain;
+       if (iommu_no_mapping(dev))
+               return 0;
+       domain = find_domain(pdev);
+       if (!domain)
+               return 0;
+       if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
+               domain_remove_one_dev_info(domain, pdev);
+               if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
+                   !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
+                   list_empty(&domain->devices))
+                       domain_exit(domain);
+       }
+       return 0;
+ }
+ static struct notifier_block device_nb = {
+       .notifier_call = device_notifier,
+ };
+ int __init intel_iommu_init(void)
+ {
+       int ret = 0;
+       /* VT-d is required for a TXT/tboot launch, so enforce that */
+       force_on = tboot_force_iommu();
+       if (dmar_table_init()) {
+               if (force_on)
+                       panic("tboot: Failed to initialize DMAR table\n");
+               return  -ENODEV;
+       }
+       if (dmar_dev_scope_init() < 0) {
+               if (force_on)
+                       panic("tboot: Failed to initialize DMAR device scope\n");
+               return  -ENODEV;
+       }
+       if (no_iommu || dmar_disabled)
+               return -ENODEV;
+       if (iommu_init_mempool()) {
+               if (force_on)
+                       panic("tboot: Failed to initialize iommu memory\n");
+               return  -ENODEV;
+       }
+       if (list_empty(&dmar_rmrr_units))
+               printk(KERN_INFO "DMAR: No RMRR found\n");
+       if (list_empty(&dmar_atsr_units))
+               printk(KERN_INFO "DMAR: No ATSR found\n");
+       if (dmar_init_reserved_ranges()) {
+               if (force_on)
+                       panic("tboot: Failed to reserve iommu ranges\n");
+               return  -ENODEV;
+       }
+       init_no_remapping_devices();
+       ret = init_dmars();
+       if (ret) {
+               if (force_on)
+                       panic("tboot: Failed to initialize DMARs\n");
+               printk(KERN_ERR "IOMMU: dmar init failed\n");
+               put_iova_domain(&reserved_iova_list);
+               iommu_exit_mempool();
+               return ret;
+       }
+       printk(KERN_INFO
+       "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
+       init_timer(&unmap_timer);
+ #ifdef CONFIG_SWIOTLB
+       swiotlb = 0;
+ #endif
+       dma_ops = &intel_dma_ops;
+       init_iommu_pm_ops();
+       bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
+       bus_register_notifier(&pci_bus_type, &device_nb);
+       return 0;
+ }
+ static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
+                                          struct pci_dev *pdev)
+ {
+       struct pci_dev *tmp, *parent;
+       if (!iommu || !pdev)
+               return;
+       /* dependent device detach */
+       tmp = pci_find_upstream_pcie_bridge(pdev);
+       /* Secondary interface's bus number and devfn 0 */
+       if (tmp) {
+               parent = pdev->bus->self;
+               while (parent != tmp) {
+                       iommu_detach_dev(iommu, parent->bus->number,
+                                        parent->devfn);
+                       parent = parent->bus->self;
+               }
+               if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
+                       iommu_detach_dev(iommu,
+                               tmp->subordinate->number, 0);
+               else /* this is a legacy PCI bridge */
+                       iommu_detach_dev(iommu, tmp->bus->number,
+                                        tmp->devfn);
+       }
+ }
+ static void domain_remove_one_dev_info(struct dmar_domain *domain,
+                                         struct pci_dev *pdev)
+ {
+       struct device_domain_info *info;
+       struct intel_iommu *iommu;
+       unsigned long flags;
+       int found = 0;
+       struct list_head *entry, *tmp;
+       iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
+                               pdev->devfn);
+       if (!iommu)
+               return;
+       spin_lock_irqsave(&device_domain_lock, flags);
+       list_for_each_safe(entry, tmp, &domain->devices) {
+               info = list_entry(entry, struct device_domain_info, link);
+               if (info->segment == pci_domain_nr(pdev->bus) &&
+                   info->bus == pdev->bus->number &&
+                   info->devfn == pdev->devfn) {
+                       list_del(&info->link);
+                       list_del(&info->global);
+                       if (info->dev)
+                               info->dev->dev.archdata.iommu = NULL;
+                       spin_unlock_irqrestore(&device_domain_lock, flags);
+                       iommu_disable_dev_iotlb(info);
+                       iommu_detach_dev(iommu, info->bus, info->devfn);
+                       iommu_detach_dependent_devices(iommu, pdev);
+                       free_devinfo_mem(info);
+                       spin_lock_irqsave(&device_domain_lock, flags);
+                       if (found)
+                               break;
+                       else
+                               continue;
+               }
+               /* If there are no other devices under the same iommu owned by
+                * this domain, clear this iommu from iommu_bmp and update the
+                * iommu count and coherency.
+                */
+               if (iommu == device_to_iommu(info->segment, info->bus,
+                                           info->devfn))
+                       found = 1;
+       }
+       spin_unlock_irqrestore(&device_domain_lock, flags);
+       if (found == 0) {
+               unsigned long tmp_flags;
+               spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
+               clear_bit(iommu->seq_id, &domain->iommu_bmp);
+               domain->iommu_count--;
+               domain_update_iommu_cap(domain);
+               spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
+               if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
+                   !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
+                       spin_lock_irqsave(&iommu->lock, tmp_flags);
+                       clear_bit(domain->id, iommu->domain_ids);
+                       iommu->domains[domain->id] = NULL;
+                       spin_unlock_irqrestore(&iommu->lock, tmp_flags);
+               }
+       }
+ }
+ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
+ {
+       struct device_domain_info *info;
+       struct intel_iommu *iommu;
+       unsigned long flags1, flags2;
+       spin_lock_irqsave(&device_domain_lock, flags1);
+       while (!list_empty(&domain->devices)) {
+               info = list_entry(domain->devices.next,
+                       struct device_domain_info, link);
+               list_del(&info->link);
+               list_del(&info->global);
+               if (info->dev)
+                       info->dev->dev.archdata.iommu = NULL;
+               spin_unlock_irqrestore(&device_domain_lock, flags1);
+               iommu_disable_dev_iotlb(info);
+               iommu = device_to_iommu(info->segment, info->bus, info->devfn);
+               iommu_detach_dev(iommu, info->bus, info->devfn);
+               iommu_detach_dependent_devices(iommu, info->dev);
+               /* clear this iommu in iommu_bmp, update iommu count
+                * and capabilities
+                */
+               spin_lock_irqsave(&domain->iommu_lock, flags2);
+               if (test_and_clear_bit(iommu->seq_id,
+                                      &domain->iommu_bmp)) {
+                       domain->iommu_count--;
+                       domain_update_iommu_cap(domain);
+               }
+               spin_unlock_irqrestore(&domain->iommu_lock, flags2);
+               free_devinfo_mem(info);
+               spin_lock_irqsave(&device_domain_lock, flags1);
+       }
+       spin_unlock_irqrestore(&device_domain_lock, flags1);
+ }
+ /* domain id for a virtual machine; it won't be set in a context entry */
+ static unsigned long vm_domid;
+ static struct dmar_domain *iommu_alloc_vm_domain(void)
+ {
+       struct dmar_domain *domain;
+       domain = alloc_domain_mem();
+       if (!domain)
+               return NULL;
+       domain->id = vm_domid++;
+       domain->nid = -1;
+       memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
+       domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
+       return domain;
+ }
+ static int md_domain_init(struct dmar_domain *domain, int guest_width)
+ {
+       int adjust_width;
+       init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
+       spin_lock_init(&domain->iommu_lock);
+       domain_reserve_special_ranges(domain);
+       /* calculate AGAW */
+       domain->gaw = guest_width;
+       adjust_width = guestwidth_to_adjustwidth(guest_width);
+       domain->agaw = width_to_agaw(adjust_width);
+       INIT_LIST_HEAD(&domain->devices);
+       domain->iommu_count = 0;
+       domain->iommu_coherency = 0;
+       domain->iommu_snooping = 0;
+       domain->iommu_superpage = 0;
+       domain->max_addr = 0;
+       domain->nid = -1;
+       /* always allocate the top pgd */
+       domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
+       if (!domain->pgd)
+               return -ENOMEM;
+       domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
+       return 0;
+ }
+ static void iommu_free_vm_domain(struct dmar_domain *domain)
+ {
+       unsigned long flags;
+       struct dmar_drhd_unit *drhd;
+       struct intel_iommu *iommu;
+       unsigned long i;
+       unsigned long ndomains;
+       for_each_drhd_unit(drhd) {
+               if (drhd->ignored)
+                       continue;
+               iommu = drhd->iommu;
+               ndomains = cap_ndoms(iommu->cap);
+               for_each_set_bit(i, iommu->domain_ids, ndomains) {
+                       if (iommu->domains[i] == domain) {
+                               spin_lock_irqsave(&iommu->lock, flags);
+                               clear_bit(i, iommu->domain_ids);
+                               iommu->domains[i] = NULL;
+                               spin_unlock_irqrestore(&iommu->lock, flags);
+                               break;
+                       }
+               }
+       }
+ }
+ static void vm_domain_exit(struct dmar_domain *domain)
+ {
+       /* Domain 0 is reserved, so don't process it */
+       if (!domain)
+               return;
+       vm_domain_remove_all_dev_info(domain);
+       /* destroy iovas */
+       put_iova_domain(&domain->iovad);
+       /* clear ptes */
+       dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+       /* free page tables */
+       dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+       iommu_free_vm_domain(domain);
+       free_domain_mem(domain);
+ }
+ static int intel_iommu_domain_init(struct iommu_domain *domain)
+ {
+       struct dmar_domain *dmar_domain;
+       dmar_domain = iommu_alloc_vm_domain();
+       if (!dmar_domain) {
+               printk(KERN_ERR
+                       "intel_iommu_domain_init: dmar_domain == NULL\n");
+               return -ENOMEM;
+       }
+       if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+               printk(KERN_ERR
+                       "intel_iommu_domain_init() failed\n");
+               vm_domain_exit(dmar_domain);
+               return -ENOMEM;
+       }
+       domain_update_iommu_cap(dmar_domain);
+       domain->priv = dmar_domain;
+       return 0;
+ }
+ static void intel_iommu_domain_destroy(struct iommu_domain *domain)
+ {
+       struct dmar_domain *dmar_domain = domain->priv;
+       domain->priv = NULL;
+       vm_domain_exit(dmar_domain);
+ }
+ static int intel_iommu_attach_device(struct iommu_domain *domain,
+                                    struct device *dev)
+ {
+       struct dmar_domain *dmar_domain = domain->priv;
+       struct pci_dev *pdev = to_pci_dev(dev);
+       struct intel_iommu *iommu;
+       int addr_width;
+       /* normally pdev is not mapped */
+       if (unlikely(domain_context_mapped(pdev))) {
+               struct dmar_domain *old_domain;
+               old_domain = find_domain(pdev);
+               if (old_domain) {
+                       if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
+                           dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
+                               domain_remove_one_dev_info(old_domain, pdev);
+                       else
+                               domain_remove_dev_info(old_domain);
+               }
+       }
+       iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
+                               pdev->devfn);
+       if (!iommu)
+               return -ENODEV;
+       /* check if this iommu agaw is sufficient for max mapped address */
+       addr_width = agaw_to_width(iommu->agaw);
+       if (addr_width > cap_mgaw(iommu->cap))
+               addr_width = cap_mgaw(iommu->cap);
+       if (dmar_domain->max_addr > (1LL << addr_width)) {
+               printk(KERN_ERR "%s: iommu width (%d) is not "
+                      "sufficient for the mapped address (%llx)\n",
+                      __func__, addr_width, dmar_domain->max_addr);
+               return -EFAULT;
+       }
+       dmar_domain->gaw = addr_width;
+       /*
+        * Knock out extra levels of page tables if necessary
+        */
+       while (iommu->agaw < dmar_domain->agaw) {
+               struct dma_pte *pte;
+               pte = dmar_domain->pgd;
+               if (dma_pte_present(pte)) {
+                       dmar_domain->pgd = (struct dma_pte *)
+                               phys_to_virt(dma_pte_addr(pte));
+                       free_pgtable_page(pte);
+               }
+               dmar_domain->agaw--;
+       }
+       return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
+ }
+ static void intel_iommu_detach_device(struct iommu_domain *domain,
+                                     struct device *dev)
+ {
+       struct dmar_domain *dmar_domain = domain->priv;
+       struct pci_dev *pdev = to_pci_dev(dev);
+       domain_remove_one_dev_info(dmar_domain, pdev);
+ }
+ static int intel_iommu_map(struct iommu_domain *domain,
+                          unsigned long iova, phys_addr_t hpa,
+                          int gfp_order, int iommu_prot)
+ {
+       struct dmar_domain *dmar_domain = domain->priv;
+       u64 max_addr;
+       int prot = 0;
+       size_t size;
+       int ret;
+       if (iommu_prot & IOMMU_READ)
+               prot |= DMA_PTE_READ;
+       if (iommu_prot & IOMMU_WRITE)
+               prot |= DMA_PTE_WRITE;
+       if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
+               prot |= DMA_PTE_SNP;
+       size     = PAGE_SIZE << gfp_order;
+       max_addr = iova + size;
+       if (dmar_domain->max_addr < max_addr) {
+               u64 end;
+               /* check if minimum agaw is sufficient for mapped address */
+               end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
+               if (end < max_addr) {
+                       printk(KERN_ERR "%s: iommu width (%d) is not "
+                              "sufficient for the mapped address (%llx)\n",
+                              __func__, dmar_domain->gaw, max_addr);
+                       return -EFAULT;
+               }
+               dmar_domain->max_addr = max_addr;
+       }
+       /* Round up size to next multiple of PAGE_SIZE, if it and
+          the low bits of hpa would take us onto the next page */
+       size = aligned_nrpages(hpa, size);
+       ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
+                                hpa >> VTD_PAGE_SHIFT, size, prot);
+       return ret;
+ }
+ static int intel_iommu_unmap(struct iommu_domain *domain,
+                            unsigned long iova, int gfp_order)
+ {
+       struct dmar_domain *dmar_domain = domain->priv;
+       size_t size = PAGE_SIZE << gfp_order;
+       int order;
+       order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
+                           (iova + size - 1) >> VTD_PAGE_SHIFT);
+       if (dmar_domain->max_addr == iova + size)
+               dmar_domain->max_addr = iova;
+       return order;
+ }
+ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
+                                           unsigned long iova)
+ {
+       struct dmar_domain *dmar_domain = domain->priv;
+       struct dma_pte *pte;
+       u64 phys = 0;
+       pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
+       if (pte)
+               phys = dma_pte_addr(pte);
+       return phys;
+ }
+ static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
+                                     unsigned long cap)
+ {
+       struct dmar_domain *dmar_domain = domain->priv;
+       if (cap == IOMMU_CAP_CACHE_COHERENCY)
+               return dmar_domain->iommu_snooping;
+       if (cap == IOMMU_CAP_INTR_REMAP)
+               return intr_remapping_enabled;
+       return 0;
+ }
+ static struct iommu_ops intel_iommu_ops = {
+       .domain_init    = intel_iommu_domain_init,
+       .domain_destroy = intel_iommu_domain_destroy,
+       .attach_dev     = intel_iommu_attach_device,
+       .detach_dev     = intel_iommu_detach_device,
+       .map            = intel_iommu_map,
+       .unmap          = intel_iommu_unmap,
+       .iova_to_phys   = intel_iommu_iova_to_phys,
+       .domain_has_cap = intel_iommu_domain_has_cap,
+ };
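These callbacks are in turn only exercised through the generic IOMMU API once bus_set_iommu() has registered them for the PCI bus in intel_iommu_init() above. A rough sketch of how a consumer such as device assignment would drive them in this kernel generation; "dev", the IOVA and the physical address are placeholders, not values taken from this patch:

#include <linux/iommu.h>
#include <linux/pci.h>

/* Illustrative only: map one page into a freshly allocated domain. */
static int example_iommu_map(struct device *dev)
{
        struct iommu_domain *dom;
        int ret;

        dom = iommu_domain_alloc(&pci_bus_type);        /* -> intel_iommu_domain_init() */
        if (!dom)
                return -ENOMEM;

        ret = iommu_attach_device(dom, dev);            /* -> intel_iommu_attach_device() */
        if (ret)
                goto out_free;

        /* One 4KiB page at IOVA 0x100000; gfp_order == 0 in this era's iommu_map(). */
        ret = iommu_map(dom, 0x100000, 0x80000000ULL, 0,
                        IOMMU_READ | IOMMU_WRITE);      /* -> intel_iommu_map() */
        if (!ret)
                iommu_unmap(dom, 0x100000, 0);          /* -> intel_iommu_unmap() */

        iommu_detach_device(dom, dev);                  /* -> intel_iommu_detach_device() */
out_free:
        iommu_domain_free(dom);                         /* -> intel_iommu_domain_destroy() */
        return ret;
}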
+ static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
+ {
+       /*
+        * Mobile 4 Series Chipset neglects to set RWBF capability,
+        * but needs it:
+        */
+       printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
+       rwbf_quirk = 1;
+       /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
+       if (dev->revision == 0x07) {
+               printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
+               dmar_map_gfx = 0;
+       }
+ }
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
+ #define GGC 0x52
+ #define GGC_MEMORY_SIZE_MASK  (0xf << 8)
+ #define GGC_MEMORY_SIZE_NONE  (0x0 << 8)
+ #define GGC_MEMORY_SIZE_1M    (0x1 << 8)
+ #define GGC_MEMORY_SIZE_2M    (0x3 << 8)
+ #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
+ #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
+ #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
+ #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
+ static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
+ {
+       unsigned short ggc;
+       if (pci_read_config_word(dev, GGC, &ggc))
+               return;
+       if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
+               printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
+               dmar_map_gfx = 0;
+       } else if (dmar_map_gfx) {
+               /* we have to ensure the gfx device is idle before we flush */
+               printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
+               intel_iommu_strict = 1;
+        }
+ }
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
+ /* On Tylersburg chipsets, some BIOSes have been known to enable the
+    ISOCH DMAR unit for the Azalia sound device, but not give it any
+    TLB entries, which causes it to deadlock. Check for that.  We do
+    this in a function called from init_dmars(), instead of in a PCI
+    quirk, because we don't want to print the obnoxious "BIOS broken"
+    message if VT-d is actually disabled.
+ */
+ static void __init check_tylersburg_isoch(void)
+ {
+       struct pci_dev *pdev;
+       uint32_t vtisochctrl;
+       /* If there's no Azalia in the system anyway, forget it. */
+       pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
+       if (!pdev)
+               return;
+       pci_dev_put(pdev);
+       /* System Management Registers. Might be hidden, in which case
+          we can't do the sanity check. But that's OK, because the
+          known-broken BIOSes _don't_ actually hide it, so far. */
+       pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
+       if (!pdev)
+               return;
+       if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
+               pci_dev_put(pdev);
+               return;
+       }
+       pci_dev_put(pdev);
+       /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
+       if (vtisochctrl & 1)
+               return;
+       /* Drop all bits other than the number of TLB entries */
+       vtisochctrl &= 0x1c;
+       /* If we have the recommended number of TLB entries (16), fine. */
+       if (vtisochctrl == 0x10)
+               return;
+       /* Zero TLB entries? You get to ride the short bus to school. */
+       if (!vtisochctrl) {
+               WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
+                    "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
+                    dmi_get_system_info(DMI_BIOS_VENDOR),
+                    dmi_get_system_info(DMI_BIOS_VERSION),
+                    dmi_get_system_info(DMI_PRODUCT_VERSION));
+               iommu_identity_mapping |= IDENTMAP_AZALIA;
+               return;
+       }
+
+       printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
+              vtisochctrl);
+ }
diff --combined include/linux/memblock.h
index 90746318cec4eb234085842b3cc80422b67b0cc8,e6b843e16e81d592131e74a47974182cadb301d9..ab89b417655c47e1f766dced9c14401080791bcf
@@@ -2,6 -2,8 +2,6 @@@
  #define _LINUX_MEMBLOCK_H
  #ifdef __KERNEL__
  
 -#define MEMBLOCK_ERROR        0
 -
  #ifdef CONFIG_HAVE_MEMBLOCK
  /*
   * Logical memory blocks.
  #include <linux/init.h>
  #include <linux/mm.h>
  
 -#include <asm/memblock.h>
 -
  #define INIT_MEMBLOCK_REGIONS 128
  
  struct memblock_region {
        phys_addr_t base;
        phys_addr_t size;
 +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 +      int nid;
 +#endif
  };
  
  struct memblock_type {
@@@ -47,8 -48,7 +47,8 @@@ extern int memblock_can_resize
  #define memblock_dbg(fmt, ...) \
        if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
  
 -u64 memblock_find_in_range(u64 start, u64 end, u64 size, u64 align);
 +phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
 +                                 phys_addr_t size, phys_addr_t align);
  int memblock_free_reserved_regions(void);
  int memblock_reserve_reserved_regions(void);
  
@@@ -59,56 -59,9 +59,56 @@@ extern long memblock_remove(phys_addr_
  extern long memblock_free(phys_addr_t base, phys_addr_t size);
  extern long memblock_reserve(phys_addr_t base, phys_addr_t size);
  
 +extern void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start,
 +                                phys_addr_t *out_end, int *out_nid);
 +
 +/**
 + * for_each_free_mem_range - iterate through free memblock areas
 + * @i: u64 used as loop variable
 + * @nid: node selector, %MAX_NUMNODES for all nodes
 + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 + * @p_nid: ptr to int for nid of the range, can be %NULL
 + *
 + * Walks over free (memory && !reserved) areas of memblock.  Available as
 + * soon as memblock is initialized.
 + */
 +#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid)                \
 +      for (i = 0,                                                     \
 +           __next_free_mem_range(&i, nid, p_start, p_end, p_nid);     \
 +           i != (u64)ULLONG_MAX;                                      \
 +           __next_free_mem_range(&i, nid, p_start, p_end, p_nid))
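As a usage note, the iterator is driven like the sketch below; MAX_NUMNODES selects all nodes as documented above, the function name is made up, and linux/memblock.h plus pr_info() are assumed to be available:

static void __init example_dump_free_ranges(void)
{
        phys_addr_t start, end;
        int nid;
        u64 i;

        /* Walk every free (memory && !reserved) area known to memblock. */
        for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, &nid)
                pr_info("free: [%#llx-%#llx) nid %d\n",
                        (unsigned long long)start, (unsigned long long)end, nid);
}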
 +
 +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 +extern int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid);
 +
 +static inline void memblock_set_region_node(struct memblock_region *r, int nid)
 +{
 +      r->nid = nid;
 +}
 +
 +static inline int memblock_get_region_node(const struct memblock_region *r)
 +{
 +      return r->nid;
 +}
 +#else
 +static inline void memblock_set_region_node(struct memblock_region *r, int nid)
 +{
 +}
 +
 +static inline int memblock_get_region_node(const struct memblock_region *r)
 +{
 +      return 0;
 +}
 +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 +
  /* The numa aware allocator is only available if
   * CONFIG_ARCH_POPULATES_NODE_MAP is set
   */
 +extern phys_addr_t memblock_find_in_range_node(phys_addr_t start,
 +                                             phys_addr_t end,
 +                                             phys_addr_t size,
 +                                             phys_addr_t align, int nid);
  extern phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align,
                                        int nid);
  extern phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
@@@ -127,6 -80,7 +127,7 @@@ extern phys_addr_t __memblock_alloc_bas
                                           phys_addr_t align,
                                           phys_addr_t max_addr);
  extern phys_addr_t memblock_phys_mem_size(void);
+ extern phys_addr_t memblock_start_of_DRAM(void);
  extern phys_addr_t memblock_end_of_DRAM(void);
  extern void memblock_enforce_memory_limit(phys_addr_t memory_limit);
  extern int memblock_is_memory(phys_addr_t addr);
@@@ -136,6 -90,11 +137,6 @@@ extern int memblock_is_region_reserved(
  
  extern void memblock_dump_all(void);
  
 -/* Provided by the architecture */
 -extern phys_addr_t memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid);
 -extern int memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
 -                                 phys_addr_t addr2, phys_addr_t size2);
 -
  /**
   * memblock_set_current_limit - Set the current allocation limit to allow
   *                         limiting allocations to what is currently
@@@ -195,9 -154,9 +196,9 @@@ static inline unsigned long memblock_re
             region++)
  
  
 -#ifdef ARCH_DISCARD_MEMBLOCK
 -#define __init_memblock __init
 -#define __initdata_memblock __initdata
 +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
 +#define __init_memblock __meminit
 +#define __initdata_memblock __meminitdata
  #else
  #define __init_memblock
  #define __initdata_memblock
  #else
  static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
  {
 -      return MEMBLOCK_ERROR;
 +      return 0;
  }
  
  #endif /* CONFIG_HAVE_MEMBLOCK */
diff --combined include/linux/mm.h
index ceb1e4a1a73629f46a7edfb0e403b9fb0c53bbb9,3dc3a8c2c4858a1d3400aa2d5fd029d36a1177c6..6b365aee83960447ee48182ae2e9e01478cc0c46
@@@ -15,6 -15,7 +15,7 @@@
  #include <linux/range.h>
  #include <linux/pfn.h>
  #include <linux/bit_spinlock.h>
+ #include <linux/shrinker.h>
  
  struct mempolicy;
  struct anon_vma;
@@@ -355,36 -356,50 +356,50 @@@ static inline struct page *compound_hea
        return page;
  }
  
+ /*
+  * The atomic page->_mapcount, starts from -1: so that transitions
+  * both from it and to it can be tracked, using atomic_inc_and_test
+  * and atomic_add_negative(-1).
+  */
+ static inline void reset_page_mapcount(struct page *page)
+ {
+       atomic_set(&(page)->_mapcount, -1);
+ }
+ static inline int page_mapcount(struct page *page)
+ {
+       return atomic_read(&(page)->_mapcount) + 1;
+ }
  static inline int page_count(struct page *page)
  {
        return atomic_read(&compound_head(page)->_count);
  }
  
+ static inline void get_huge_page_tail(struct page *page)
+ {
+       /*
+        * __split_huge_page_refcount() cannot run
+        * from under us.
+        */
+       VM_BUG_ON(page_mapcount(page) < 0);
+       VM_BUG_ON(atomic_read(&page->_count) != 0);
+       atomic_inc(&page->_mapcount);
+ }
+ extern bool __get_page_tail(struct page *page);
  static inline void get_page(struct page *page)
  {
+       if (unlikely(PageTail(page)))
+               if (likely(__get_page_tail(page)))
+                       return;
        /*
         * Getting a normal page or the head of a compound page
-        * requires to already have an elevated page->_count. Only if
-        * we're getting a tail page, the elevated page->_count is
-        * required only in the head page, so for tail pages the
-        * bugcheck only verifies that the page->_count isn't
-        * negative.
+        * requires to already have an elevated page->_count.
         */
-       VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page));
+       VM_BUG_ON(atomic_read(&page->_count) <= 0);
        atomic_inc(&page->_count);
-       /*
-        * Getting a tail page will elevate both the head and tail
-        * page->_count(s).
-        */
-       if (unlikely(PageTail(page))) {
-               /*
-                * This is safe only because
-                * __split_huge_page_refcount can't run under
-                * get_page().
-                */
-               VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
-               atomic_inc(&page->first_page->_count);
-       }
  }
  
  static inline struct page *virt_to_head_page(const void *x)
@@@ -636,7 -651,7 +651,7 @@@ static inline pte_t maybe_mkwrite(pte_
  #define SECTIONS_MASK         ((1UL << SECTIONS_WIDTH) - 1)
  #define ZONEID_MASK           ((1UL << ZONEID_SHIFT) - 1)
  
- static inline enum zone_type page_zonenum(struct page *page)
+ static inline enum zone_type page_zonenum(const struct page *page)
  {
        return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
  }
@@@ -664,15 -679,15 +679,15 @@@ static inline int zone_to_nid(struct zo
  }
  
  #ifdef NODE_NOT_IN_PAGE_FLAGS
- extern int page_to_nid(struct page *page);
+ extern int page_to_nid(const struct page *page);
  #else
- static inline int page_to_nid(struct page *page)
+ static inline int page_to_nid(const struct page *page)
  {
        return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
  }
  #endif
  
- static inline struct zone *page_zone(struct page *page)
+ static inline struct zone *page_zone(const struct page *page)
  {
        return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
  }
@@@ -684,7 -699,7 +699,7 @@@ static inline void set_page_section(str
        page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
  }
  
- static inline unsigned long page_to_section(struct page *page)
+ static inline unsigned long page_to_section(const struct page *page)
  {
        return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
  }
@@@ -717,7 -732,7 +732,7 @@@ static inline void set_page_links(struc
   */
  #include <linux/vmstat.h>
  
- static __always_inline void *lowmem_page_address(struct page *page)
+ static __always_inline void *lowmem_page_address(const struct page *page)
  {
        return __va(PFN_PHYS(page_to_pfn(page)));
  }
  #endif
  
  #if defined(HASHED_PAGE_VIRTUAL)
- void *page_address(struct page *page);
+ void *page_address(const struct page *page);
  void set_page_address(struct page *page, void *virtual);
  void page_address_init(void);
  #endif
@@@ -802,21 -817,6 +817,6 @@@ static inline pgoff_t page_index(struc
        return page->index;
  }
  
- /*
-  * The atomic page->_mapcount, like _count, starts from -1:
-  * so that transitions both from it and to it can be tracked,
-  * using atomic_inc_and_test and atomic_add_negative(-1).
-  */
- static inline void reset_page_mapcount(struct page *page)
- {
-       atomic_set(&(page)->_mapcount, -1);
- }
- static inline int page_mapcount(struct page *page)
- {
-       return atomic_read(&(page)->_mapcount) + 1;
- }
  /*
   * Return true if this page is mapped into pagetables.
   */
@@@ -910,6 -910,8 +910,8 @@@ unsigned long unmap_vmas(struct mmu_gat
   * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
   * @pte_hole: if set, called for each hole at all levels
   * @hugetlb_entry: if set, called for each hugetlb entry
+  *               *Caution*: The caller must hold mmap_sem if @hugetlb_entry
+  *                          is used.
   *
   * (see walk_page_range for more details)
   */
@@@ -959,6 -961,8 +961,8 @@@ int invalidate_inode_page(struct page *
  #ifdef CONFIG_MMU
  extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
+ extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+                           unsigned long address, unsigned int fault_flags);
  #else
  static inline int handle_mm_fault(struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long address,
        BUG();
        return VM_FAULT_SIGBUS;
  }
+ static inline int fixup_user_fault(struct task_struct *tsk,
+               struct mm_struct *mm, unsigned long address,
+               unsigned int fault_flags)
+ {
+       /* should never happen if there's no MMU */
+       BUG();
+       return -EFAULT;
+ }
  #endif
  
  extern int make_pages_present(unsigned long addr, unsigned long end);
@@@ -1121,44 -1133,6 +1133,6 @@@ static inline void sync_mm_rss(struct t
  }
  #endif
  
- /*
-  * This struct is used to pass information from page reclaim to the shrinkers.
-  * We consolidate the values for easier extention later.
-  */
- struct shrink_control {
-       gfp_t gfp_mask;
-       /* How many slab objects shrinker() should scan and try to reclaim */
-       unsigned long nr_to_scan;
- };
- /*
-  * A callback you can register to apply pressure to ageable caches.
-  *
-  * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
-  * and a 'gfpmask'.  It should look through the least-recently-used
-  * 'nr_to_scan' entries and attempt to free them up.  It should return
-  * the number of objects which remain in the cache.  If it returns -1, it means
-  * it cannot do any scanning at this time (eg. there is a risk of deadlock).
-  *
-  * The 'gfpmask' refers to the allocation we are currently trying to
-  * fulfil.
-  *
-  * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
-  * querying the cache size, so a fastpath for that case is appropriate.
-  */
- struct shrinker {
-       int (*shrink)(struct shrinker *, struct shrink_control *sc);
-       int seeks;      /* seeks to recreate an obj */
-       /* These are for internal use */
-       struct list_head list;
-       long nr;        /* objs pending delete */
- };
- #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
- extern void register_shrinker(struct shrinker *);
- extern void unregister_shrinker(struct shrinker *);
  int vma_wants_writenotify(struct vm_area_struct *vma);
  
  extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@@ -1307,14 -1281,12 +1281,14 @@@ extern void free_area_init_node(int nid
   * CONFIG_ARCH_POPULATES_NODE_MAP
   */
  extern void free_area_init_nodes(unsigned long *max_zone_pfn);
 +#ifndef CONFIG_HAVE_MEMBLOCK_NODE_MAP
  extern void add_active_range(unsigned int nid, unsigned long start_pfn,
                                        unsigned long end_pfn);
  extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
                                        unsigned long end_pfn);
  extern void remove_all_active_ranges(void);
  void sort_node_map(void);
 +#endif
  unsigned long node_map_pfn_alignment(void);
  unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
                                                unsigned long end_pfn);
@@@ -1327,27 -1299,11 +1301,27 @@@ extern void free_bootmem_with_active_re
                                                unsigned long max_low_pfn);
  int add_from_early_node_map(struct range *range, int az,
                                   int nr_range, int nid);
 -u64 __init find_memory_core_early(int nid, u64 size, u64 align,
 -                                      u64 goal, u64 limit);
 -typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
 -extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
  extern void sparse_memory_present_with_active_regions(int nid);
 +
 +extern void __next_mem_pfn_range(int *idx, int nid,
 +                               unsigned long *out_start_pfn,
 +                               unsigned long *out_end_pfn, int *out_nid);
 +
 +/**
 + * for_each_mem_pfn_range - early memory pfn range iterator
 + * @i: an integer used as loop variable
 + * @nid: node selector, %MAX_NUMNODES for all nodes
 + * @p_start: ptr to ulong for start pfn of the range, can be %NULL
 + * @p_end: ptr to ulong for end pfn of the range, can be %NULL
 + * @p_nid: ptr to int for nid of the range, can be %NULL
 + *
 + * Walks over configured memory ranges.  Available after early_node_map is
 + * populated.
 + */
 +#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid)         \
 +      for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
 +           i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
 +
  #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
  
  #if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \
@@@ -1377,7 -1333,8 +1351,8 @@@ extern void si_meminfo(struct sysinfo 
  extern void si_meminfo_node(struct sysinfo *val, int nid);
  extern int after_bootmem;
  
- extern void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
+ extern __printf(3, 4)
+ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
  
  extern void setup_per_cpu_pageset(void);
  
@@@ -1464,8 -1421,7 +1439,7 @@@ extern int do_munmap(struct mm_struct *
  
  extern unsigned long do_brk(unsigned long, unsigned long);
  
- /* filemap.c */
- extern unsigned long page_unuse(struct page *);
+ /* truncate.c */
  extern void truncate_inode_pages(struct address_space *, loff_t);
  extern void truncate_inode_pages_range(struct address_space *,
                                       loff_t lstart, loff_t lend);
@@@ -1652,6 -1608,7 +1626,7 @@@ enum mf_flags 
  };
  extern void memory_failure(unsigned long pfn, int trapno);
  extern int __memory_failure(unsigned long pfn, int trapno, int flags);
+ extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
  extern int unpoison_memory(unsigned long pfn);
  extern int sysctl_memory_failure_early_kill;
  extern int sysctl_memory_failure_recovery;
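For orientation (not part of this commit): the for_each_mem_pfn_range() iterator declared above walks early_node_map once it has been populated, under CONFIG_ARCH_POPULATES_NODE_MAP. A minimal sketch of a caller; count_node_pages() is a hypothetical helper name, not something this series adds.

static unsigned long __init count_node_pages(int nid)
{
	unsigned long start_pfn, end_pfn, pages = 0;
	int i;

	/* walk every early memory range on @nid; the nid output is not needed */
	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
		pages += end_pfn - start_pfn;

	return pages;
}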
diff --combined kernel/printk.c
index b1d5a6174d652f580935178217c8b078f2fc4f3e,1455a0d4eedd4b386c759d689f939ba5d7a9007a..baf2aebd6970af85547a865276785557b4066e27
@@@ -100,7 -100,7 +100,7 @@@ static int console_locked, console_susp
   * It is also used in interesting ways to provide interlocking in
   * console_unlock();.
   */
- static DEFINE_SPINLOCK(logbuf_lock);
+ static DEFINE_RAW_SPINLOCK(logbuf_lock);
  
  #define LOG_BUF_MASK (log_buf_len-1)
  #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
@@@ -199,7 -199,7 +199,7 @@@ void __init setup_log_buf(int early
                unsigned long mem;
  
                mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
 -              if (mem == MEMBLOCK_ERROR)
 +              if (!mem)
                        return;
                new_log_buf = __va(mem);
        } else {
                return;
        }
  
-       spin_lock_irqsave(&logbuf_lock, flags);
+       raw_spin_lock_irqsave(&logbuf_lock, flags);
        log_buf_len = new_log_buf_len;
        log_buf = new_log_buf;
        new_log_buf_len = 0;
        log_start -= offset;
        con_start -= offset;
        log_end -= offset;
-       spin_unlock_irqrestore(&logbuf_lock, flags);
+       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
  
        pr_info("log_buf_len: %d\n", log_buf_len);
        pr_info("early log buf free: %d(%d%%)\n",
@@@ -318,8 -318,10 +318,10 @@@ static int check_syslog_permissions(in
                        return 0;
                /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
                if (capable(CAP_SYS_ADMIN)) {
-                       WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
-                                "but no CAP_SYSLOG (deprecated).\n");
+                       printk_once(KERN_WARNING "%s (%d): "
+                                "Attempt to access syslog with CAP_SYS_ADMIN "
+                                "but no CAP_SYSLOG (deprecated).\n",
+                                current->comm, task_pid_nr(current));
                        return 0;
                }
                return -EPERM;
@@@ -363,18 -365,18 +365,18 @@@ int do_syslog(int type, char __user *bu
                if (error)
                        goto out;
                i = 0;
-               spin_lock_irq(&logbuf_lock);
+               raw_spin_lock_irq(&logbuf_lock);
                while (!error && (log_start != log_end) && i < len) {
                        c = LOG_BUF(log_start);
                        log_start++;
-                       spin_unlock_irq(&logbuf_lock);
+                       raw_spin_unlock_irq(&logbuf_lock);
                        error = __put_user(c,buf);
                        buf++;
                        i++;
                        cond_resched();
-                       spin_lock_irq(&logbuf_lock);
+                       raw_spin_lock_irq(&logbuf_lock);
                }
-               spin_unlock_irq(&logbuf_lock);
+               raw_spin_unlock_irq(&logbuf_lock);
                if (!error)
                        error = i;
                break;
                count = len;
                if (count > log_buf_len)
                        count = log_buf_len;
-               spin_lock_irq(&logbuf_lock);
+               raw_spin_lock_irq(&logbuf_lock);
                if (count > logged_chars)
                        count = logged_chars;
                if (do_clear)
                        if (j + log_buf_len < log_end)
                                break;
                        c = LOG_BUF(j);
-                       spin_unlock_irq(&logbuf_lock);
+                       raw_spin_unlock_irq(&logbuf_lock);
                        error = __put_user(c,&buf[count-1-i]);
                        cond_resched();
-                       spin_lock_irq(&logbuf_lock);
+                       raw_spin_lock_irq(&logbuf_lock);
                }
-               spin_unlock_irq(&logbuf_lock);
+               raw_spin_unlock_irq(&logbuf_lock);
                if (error)
                        break;
                error = i;
@@@ -530,6 -532,9 +532,9 @@@ static int __init ignore_loglevel_setup
  }
  
  early_param("ignore_loglevel", ignore_loglevel_setup);
+ module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR);
+ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to "
+       "print all kernel messages to the console.");
  
  /*
   * Write out chars from start to end - 1 inclusive
@@@ -590,9 -595,6 +595,6 @@@ static size_t log_prefix(const char *p
                /* multi digit including the level and facility number */
                char *endp = NULL;
  
-               if (p[1] < '0' && p[1] > '9')
-                       return 0;
                lev = (simple_strtoul(&p[1], &endp, 10) & 7);
                if (endp == NULL || endp[0] != '>')
                        return 0;
@@@ -687,7 -689,7 +689,7 @@@ static void zap_locks(void
        oops_timestamp = jiffies;
  
        /* If a crash is occurring, make sure we can't deadlock */
-       spin_lock_init(&logbuf_lock);
+       raw_spin_lock_init(&logbuf_lock);
        /* And make sure that we print immediately */
        sema_init(&console_sem, 1);
  }
@@@ -782,7 -784,7 +784,7 @@@ static inline int can_use_console(unsig
  static int console_trylock_for_printk(unsigned int cpu)
        __releases(&logbuf_lock)
  {
-       int retval = 0;
+       int retval = 0, wake = 0;
  
        if (console_trylock()) {
                retval = 1;
                 */
                if (!can_use_console(cpu)) {
                        console_locked = 0;
-                       up(&console_sem);
+                       wake = 1;
                        retval = 0;
                }
        }
        printk_cpu = UINT_MAX;
-       spin_unlock(&logbuf_lock);
+       if (wake)
+               up(&console_sem);
+       raw_spin_unlock(&logbuf_lock);
        return retval;
  }
  static const char recursion_bug_msg [] =
@@@ -860,7 -864,7 +864,7 @@@ asmlinkage int vprintk(const char *fmt
        }
  
        lockdep_off();
-       spin_lock(&logbuf_lock);
+       raw_spin_lock(&logbuf_lock);
        printk_cpu = this_cpu;
  
        if (recursion_bug) {
@@@ -1104,6 -1108,10 +1108,10 @@@ static int __init console_suspend_disab
        return 1;
  }
  __setup("no_console_suspend", console_suspend_disable);
+ module_param_named(console_suspend, console_suspend_enabled,
+               bool, S_IRUGO | S_IWUSR);
+ MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
+       " and hibernate operations");
  
  /**
   * suspend_console - suspend the console subsystem
@@@ -1242,7 -1250,7 +1250,7 @@@ void console_unlock(void
  {
        unsigned long flags;
        unsigned _con_start, _log_end;
-       unsigned wake_klogd = 0;
+       unsigned wake_klogd = 0, retry = 0;
  
        if (console_suspended) {
                up(&console_sem);
  
        console_may_schedule = 0;
  
+ again:
        for ( ; ; ) {
-               spin_lock_irqsave(&logbuf_lock, flags);
+               raw_spin_lock_irqsave(&logbuf_lock, flags);
                wake_klogd |= log_start - log_end;
                if (con_start == log_end)
                        break;                  /* Nothing to print */
                _con_start = con_start;
                _log_end = log_end;
                con_start = log_end;            /* Flush */
-               spin_unlock(&logbuf_lock);
+               raw_spin_unlock(&logbuf_lock);
                stop_critical_timings();        /* don't trace print latency */
                call_console_drivers(_con_start, _log_end);
                start_critical_timings();
        if (unlikely(exclusive_console))
                exclusive_console = NULL;
  
+       raw_spin_unlock(&logbuf_lock);
        up(&console_sem);
-       spin_unlock_irqrestore(&logbuf_lock, flags);
+       /*
+        * Someone could have filled up the buffer again, so re-check if there's
+        * something to flush. In case we cannot trylock the console_sem again,
+        * there's a new owner and the console_unlock() from them will do the
+        * flush, no worries.
+        */
+       raw_spin_lock(&logbuf_lock);
+       if (con_start != log_end)
+               retry = 1;
+       if (retry && console_trylock())
+               goto again;
+       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
        if (wake_klogd)
                wake_up_klogd();
  }
@@@ -1502,9 -1526,9 +1526,9 @@@ void register_console(struct console *n
                 * console_unlock(); will print out the buffered messages
                 * for us.
                 */
-               spin_lock_irqsave(&logbuf_lock, flags);
+               raw_spin_lock_irqsave(&logbuf_lock, flags);
                con_start = log_start;
-               spin_unlock_irqrestore(&logbuf_lock, flags);
+               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
                /*
                 * We're about to replay the log buffer.  Only do this to the
                 * just-registered console to avoid excessive message spam to
@@@ -1584,7 -1608,7 +1608,7 @@@ static int __init printk_late_init(void
        struct console *con;
  
        for_each_console(con) {
-               if (con->flags & CON_BOOT) {
+               if (!keep_bootcon && con->flags & CON_BOOT) {
                        printk(KERN_INFO "turn off boot console %s%d\n",
                                con->name, con->index);
                        unregister_console(con);
@@@ -1711,10 -1735,10 +1735,10 @@@ void kmsg_dump(enum kmsg_dump_reason re
        /* Theoretically, the log could move on after we do this, but
           there's not a lot we can do about that. The new messages
           will overwrite the start of what we dump. */
-       spin_lock_irqsave(&logbuf_lock, flags);
+       raw_spin_lock_irqsave(&logbuf_lock, flags);
        end = log_end & LOG_BUF_MASK;
        chars = logged_chars;
-       spin_unlock_irqrestore(&logbuf_lock, flags);
+       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
  
        if (chars > end) {
                s1 = log_buf + log_buf_len - chars + end;
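Aside (illustrative only): the conversion above is a type change from spinlock_t to raw_spinlock_t; the API mirrors the regular spinlock calls one for one. A raw spinlock keeps spinning on -rt kernels, where ordinary spinlocks become sleeping locks, which a lock taken deep in the printk paths cannot tolerate. A minimal sketch with a hypothetical lock name:

static DEFINE_RAW_SPINLOCK(example_lock);	/* hypothetical */

static void example_critical_section(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	/* short, non-sleeping critical section */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}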
diff --combined mm/Kconfig
index 7c5697116fcf86bd4e4c7ad0ee9c42e4f3c04834,011b110365c8681d7b1c546223bdb8a2daccc59d..e338407f1225f0873a8eb20761c86fd82db1dfc0
@@@ -131,12 -131,9 +131,15 @@@ config SPARSEMEM_VMEMMA
  config HAVE_MEMBLOCK
        boolean
  
 +config HAVE_MEMBLOCK_NODE_MAP
 +      boolean
 +
 +config ARCH_DISCARD_MEMBLOCK
 +      boolean
 +
+ config NO_BOOTMEM
+       boolean
  # eventually, we can have this option just 'select SPARSEMEM'
  config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
@@@ -362,7 -359,7 +365,7 @@@ config CLEANCACH
          for clean pages that the kernel's pageframe replacement algorithm
          (PFRA) would like to keep around, but can't since there isn't enough
          memory.  So when the PFRA "evicts" a page, it first attempts to use
-         cleancacne code to put the data contained in that page into
+         cleancache code to put the data contained in that page into
          "transcendent memory", memory that is not directly accessible or
          addressable by the kernel and is of unknown and possibly
          time-varying size.  And when a cleancache-enabled
diff --combined mm/memblock.c
index a75723d626311ada8a344e5246389c171996e123,84bec4969ed5d3a27d9c6035da6e39ebf1dfdd18..a57092f63a862ec5ec490d50f43a08eb9021f29a
@@@ -41,13 -41,25 +41,14 @@@ static inline const char *memblock_type
  /*
   * Address comparison utilities
   */
 -
 -static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
 -{
 -      return addr & ~(size - 1);
 -}
 -
 -static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
 -{
 -      return (addr + (size - 1)) & ~(size - 1);
 -}
 -
  static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
                                       phys_addr_t base2, phys_addr_t size2)
  {
        return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
  }
  
- long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
+                                       phys_addr_t base, phys_addr_t size)
  {
        unsigned long i;
  
@@@ -74,9 -86,9 +75,9 @@@ static phys_addr_t __init_memblock memb
  
        /* In case, huge size is requested */
        if (end < size)
 -              return MEMBLOCK_ERROR;
 +              return 0;
  
 -      base = memblock_align_down((end - size), align);
 +      base = round_down(end - size, align);
  
        /* Prevent allocations returning 0 as it's also used to
         * indicate an allocation failure
                res_base = memblock.reserved.regions[j].base;
                if (res_base < size)
                        break;
 -              base = memblock_align_down(res_base - size, align);
 +              base = round_down(res_base - size, align);
        }
  
 -      return MEMBLOCK_ERROR;
 +      return 0;
  }
  
 -static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
 -                      phys_addr_t align, phys_addr_t start, phys_addr_t end)
 +/*
 + * Find a free area with specified alignment in a specific range.
 + */
 +phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end,
 +                                      phys_addr_t size, phys_addr_t align)
  {
        long i;
  
                if (bottom >= top)
                        continue;
                found = memblock_find_region(bottom, top, size, align);
 -              if (found != MEMBLOCK_ERROR)
 +              if (found)
                        return found;
        }
 -      return MEMBLOCK_ERROR;
 -}
 -
 -/*
 - * Find a free area with specified alignment in a specific range.
 - */
 -u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align)
 -{
 -      return memblock_find_base(size, align, start, end);
 +      return 0;
  }
  
  /*
@@@ -161,8 -178,12 +162,8 @@@ int __init_memblock memblock_reserve_re
  
  static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
  {
 -      unsigned long i;
 -
 -      for (i = r; i < type->cnt - 1; i++) {
 -              type->regions[i].base = type->regions[i + 1].base;
 -              type->regions[i].size = type->regions[i + 1].size;
 -      }
 +      memmove(&type->regions[r], &type->regions[r + 1],
 +              (type->cnt - (r + 1)) * sizeof(type->regions[r]));
        type->cnt--;
  
        /* Special case for empty arrays */
                type->cnt = 1;
                type->regions[0].base = 0;
                type->regions[0].size = 0;
 +              memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
        }
  }
  
@@@ -206,10 -226,10 +207,10 @@@ static int __init_memblock memblock_dou
         */
        if (use_slab) {
                new_array = kmalloc(new_size, GFP_KERNEL);
 -              addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array);
 +              addr = new_array ? __pa(new_array) : 0;
        } else
 -              addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE);
 -      if (addr == MEMBLOCK_ERROR) {
 +              addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
 +      if (!addr) {
                pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
                       memblock_type_name(type), type->max, type->max * 2);
                return -1;
        return 0;
  }
  
 -int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
 -                                        phys_addr_t addr2, phys_addr_t size2)
 -{
 -      return 1;
 -}
 -
 -static long __init_memblock memblock_add_region(struct memblock_type *type,
 -                                              phys_addr_t base, phys_addr_t size)
 +/**
 + * memblock_merge_regions - merge neighboring compatible regions
 + * @type: memblock type to scan
 + *
 + * Scan @type and merge neighboring compatible regions.
 + */
 +static void __init_memblock memblock_merge_regions(struct memblock_type *type)
  {
 -      phys_addr_t end = base + size;
 -      int i, slot = -1;
 -
 -      /* First try and coalesce this MEMBLOCK with others */
 -      for (i = 0; i < type->cnt; i++) {
 -              struct memblock_region *rgn = &type->regions[i];
 -              phys_addr_t rend = rgn->base + rgn->size;
 +      int i = 0;
  
 -              /* Exit if there's no possible hits */
 -              if (rgn->base > end || rgn->size == 0)
 -                      break;
 +      /* cnt never goes below 1 */
 +      while (i < type->cnt - 1) {
 +              struct memblock_region *this = &type->regions[i];
 +              struct memblock_region *next = &type->regions[i + 1];
  
 -              /* Check if we are fully enclosed within an existing
 -               * block
 -               */
 -              if (rgn->base <= base && rend >= end)
 -                      return 0;
 +              if (this->base + this->size != next->base ||
 +                  memblock_get_region_node(this) !=
 +                  memblock_get_region_node(next)) {
 +                      BUG_ON(this->base + this->size > next->base);
 +                      i++;
 +                      continue;
 +              }
  
 -              /* Check if we overlap or are adjacent with the bottom
 -               * of a block.
 -               */
 -              if (base < rgn->base && end >= rgn->base) {
 -                      /* If we can't coalesce, create a new block */
 -                      if (!memblock_memory_can_coalesce(base, size,
 -                                                        rgn->base,
 -                                                        rgn->size)) {
 -                              /* Overlap & can't coalesce are mutually
 -                               * exclusive, if you do that, be prepared
 -                               * for trouble
 -                               */
 -                              WARN_ON(end != rgn->base);
 -                              goto new_block;
 -                      }
 -                      /* We extend the bottom of the block down to our
 -                       * base
 -                       */
 -                      rgn->base = base;
 -                      rgn->size = rend - base;
 +              this->size += next->size;
 +              memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next));
 +              type->cnt--;
 +      }
 +}
  
 -                      /* Return if we have nothing else to allocate
 -                       * (fully coalesced)
 -                       */
 -                      if (rend >= end)
 -                              return 0;
 +/**
 + * memblock_insert_region - insert new memblock region
 + * @type: memblock type to insert into
 + * @idx: index for the insertion point
 + * @base: base address of the new region
 + * @size: size of the new region
 + *
 + * Insert new memblock region [@base,@base+@size) into @type at @idx.
 + * @type must already have extra room to accommodate the new region.
 + */
 +static void __init_memblock memblock_insert_region(struct memblock_type *type,
 +                                                 int idx, phys_addr_t base,
 +                                                 phys_addr_t size, int nid)
 +{
 +      struct memblock_region *rgn = &type->regions[idx];
  
 -                      /* We continue processing from the end of the
 -                       * coalesced block.
 -                       */
 -                      base = rend;
 -                      size = end - base;
 -              }
 +      BUG_ON(type->cnt >= type->max);
 +      memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
 +      rgn->base = base;
 +      rgn->size = size;
 +      memblock_set_region_node(rgn, nid);
 +      type->cnt++;
 +}
  
 -              /* Now check if we overlap or are adjacent with the
 -               * top of a block
 -               */
 -              if (base <= rend && end >= rend) {
 -                      /* If we can't coalesce, create a new block */
 -                      if (!memblock_memory_can_coalesce(rgn->base,
 -                                                        rgn->size,
 -                                                        base, size)) {
 -                              /* Overlap & can't coalesce are mutually
 -                               * exclusive, if you do that, be prepared
 -                               * for trouble
 -                               */
 -                              WARN_ON(rend != base);
 -                              goto new_block;
 -                      }
 -                      /* We adjust our base down to enclose the
 -                       * original block and destroy it. It will be
 -                       * part of our new allocation. Since we've
 -                       * freed an entry, we know we won't fail
 -                       * to allocate one later, so we won't risk
 -                       * losing the original block allocation.
 -                       */
 -                      size += (base - rgn->base);
 -                      base = rgn->base;
 -                      memblock_remove_region(type, i--);
 -              }
 -      }
 +/**
 + * memblock_add_region - add new memblock region
 + * @type: memblock type to add new region into
 + * @base: base address of the new region
 + * @size: size of the new region
 + *
 + * Add new memblock region [@base,@base+@size) into @type.  The new region
 + * is allowed to overlap with existing ones - overlaps don't affect already
 + * existing regions.  @type is guaranteed to be minimal (all neighbouring
 + * compatible regions are merged) after the addition.
 + *
 + * RETURNS:
 + * 0 on success, -errno on failure.
 + */
 +static long __init_memblock memblock_add_region(struct memblock_type *type,
 +                                              phys_addr_t base, phys_addr_t size)
 +{
 +      bool insert = false;
 +      phys_addr_t obase = base, end = base + size;
 +      int i, nr_new;
  
 -      /* If the array is empty, special case, replace the fake
 -       * filler region and return
 -       */
 -      if ((type->cnt == 1) && (type->regions[0].size == 0)) {
 +      /* special case for empty array */
 +      if (type->regions[0].size == 0) {
 +              WARN_ON(type->cnt != 1);
                type->regions[0].base = base;
                type->regions[0].size = size;
 +              memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
                return 0;
        }
 -
 - new_block:
 -      /* If we are out of space, we fail. It's too late to resize the array
 -       * but then this shouldn't have happened in the first place.
 +repeat:
 +      /*
 +       * The following is executed twice.  Once with %false @insert and
 +       * then with %true.  The first counts the number of regions needed
 +       * to accomodate the new area.  The second actually inserts them.
         */
 -      if (WARN_ON(type->cnt >= type->max))
 -              return -1;
 +      base = obase;
 +      nr_new = 0;
  
 -      /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
 -      for (i = type->cnt - 1; i >= 0; i--) {
 -              if (base < type->regions[i].base) {
 -                      type->regions[i+1].base = type->regions[i].base;
 -                      type->regions[i+1].size = type->regions[i].size;
 -              } else {
 -                      type->regions[i+1].base = base;
 -                      type->regions[i+1].size = size;
 -                      slot = i + 1;
 +      for (i = 0; i < type->cnt; i++) {
 +              struct memblock_region *rgn = &type->regions[i];
 +              phys_addr_t rbase = rgn->base;
 +              phys_addr_t rend = rbase + rgn->size;
 +
 +              if (rbase >= end)
                        break;
 +              if (rend <= base)
 +                      continue;
 +              /*
 +               * @rgn overlaps.  If it separates the lower part of new
 +               * area, insert that portion.
 +               */
 +              if (rbase > base) {
 +                      nr_new++;
 +                      if (insert)
 +                              memblock_insert_region(type, i++, base,
 +                                              rbase - base, MAX_NUMNODES);
                }
 +              /* area below @rend is dealt with, forget about it */
 +              base = min(rend, end);
        }
 -      if (base < type->regions[0].base) {
 -              type->regions[0].base = base;
 -              type->regions[0].size = size;
 -              slot = 0;
 +
 +      /* insert the remaining portion */
 +      if (base < end) {
 +              nr_new++;
 +              if (insert)
 +                      memblock_insert_region(type, i, base, end - base,
 +                                             MAX_NUMNODES);
        }
 -      type->cnt++;
  
 -      /* The array is full ? Try to resize it. If that fails, we undo
 -       * our allocation and return an error
 +      /*
 +       * If this was the first round, resize array and repeat for actual
 +       * insertions; otherwise, merge and return.
         */
 -      if (type->cnt == type->max && memblock_double_array(type)) {
 -              BUG_ON(slot < 0);
 -              memblock_remove_region(type, slot);
 -              return -1;
 +      if (!insert) {
 +              while (type->cnt + nr_new > type->max)
 +                      if (memblock_double_array(type) < 0)
 +                              return -ENOMEM;
 +              insert = true;
 +              goto repeat;
 +      } else {
 +              memblock_merge_regions(type);
 +              return 0;
        }
 -
 -      return 0;
  }
  
  long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
  {
        return memblock_add_region(&memblock.memory, base, size);
 -
  }
  
  static long __init_memblock __memblock_remove(struct memblock_type *type,
@@@ -449,11 -468,6 +450,11 @@@ long __init_memblock memblock_remove(ph
  
  long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
  {
 +      memblock_dbg("   memblock_free: [%#016llx-%#016llx] %pF\n",
 +                   (unsigned long long)base,
 +                   (unsigned long long)base + size,
 +                   (void *)_RET_IP_);
 +
        return __memblock_remove(&memblock.reserved, base, size);
  }
  
@@@ -461,186 -475,11 +462,186 @@@ long __init_memblock memblock_reserve(p
  {
        struct memblock_type *_rgn = &memblock.reserved;
  
 +      memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n",
 +                   (unsigned long long)base,
 +                   (unsigned long long)base + size,
 +                   (void *)_RET_IP_);
        BUG_ON(0 == size);
  
        return memblock_add_region(_rgn, base, size);
  }
  
 +/**
 + * __next_free_mem_range - next function for for_each_free_mem_range()
 + * @idx: pointer to u64 loop variable
 + * @nid: node selector, %MAX_NUMNODES for all nodes
 + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 + * @p_nid: ptr to int for nid of the range, can be %NULL
 + *
 + * Find the first free area from *@idx which matches @nid, fill the out
 + * parameters, and update *@idx for the next iteration.  The lower 32bit of
 + * *@idx contains index into memory region and the upper 32bit indexes the
 + * areas before each reserved region.  For example, if reserved regions
 + * look like the following,
 + *
 + *    0:[0-16), 1:[32-48), 2:[128-130)
 + *
 + * The upper 32bit indexes the following regions.
 + *
 + *    0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX)
 + *
 + * As both region arrays are sorted, the function advances the two indices
 + * in lockstep and returns each intersection.
 + */
 +void __init_memblock __next_free_mem_range(u64 *idx, int nid,
 +                                         phys_addr_t *out_start,
 +                                         phys_addr_t *out_end, int *out_nid)
 +{
 +      struct memblock_type *mem = &memblock.memory;
 +      struct memblock_type *rsv = &memblock.reserved;
 +      int mi = *idx & 0xffffffff;
 +      int ri = *idx >> 32;
 +
 +      for ( ; mi < mem->cnt; mi++) {
 +              struct memblock_region *m = &mem->regions[mi];
 +              phys_addr_t m_start = m->base;
 +              phys_addr_t m_end = m->base + m->size;
 +
 +              /* only memory regions are associated with nodes, check it */
 +              if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
 +                      continue;
 +
 +              /* scan areas before each reservation for intersection */
 +              for ( ; ri < rsv->cnt + 1; ri++) {
 +                      struct memblock_region *r = &rsv->regions[ri];
 +                      phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
 +                      phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
 +
 +                      /* if ri advanced past mi, break out to advance mi */
 +                      if (r_start >= m_end)
 +                              break;
 +                      /* if the two regions intersect, we're done */
 +                      if (m_start < r_end) {
 +                              if (out_start)
 +                                      *out_start = max(m_start, r_start);
 +                              if (out_end)
 +                                      *out_end = min(m_end, r_end);
 +                              if (out_nid)
 +                                      *out_nid = memblock_get_region_node(m);
 +                              /*
 +                               * The region which ends first is advanced
 +                               * for the next iteration.
 +                               */
 +                              if (m_end <= r_end)
 +                                      mi++;
 +                              else
 +                                      ri++;
 +                              *idx = (u32)mi | (u64)ri << 32;
 +                              return;
 +                      }
 +              }
 +      }
 +
 +      /* signal end of iteration */
 +      *idx = ULLONG_MAX;
 +}
 +
 +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 +/*
 + * Common iterator interface used to define for_each_mem_range().
 + */
 +void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 +                              unsigned long *out_start_pfn,
 +                              unsigned long *out_end_pfn, int *out_nid)
 +{
 +      struct memblock_type *type = &memblock.memory;
 +      struct memblock_region *r;
 +
 +      while (++*idx < type->cnt) {
 +              r = &type->regions[*idx];
 +
 +              if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
 +                      continue;
 +              if (nid == MAX_NUMNODES || nid == r->nid)
 +                      break;
 +      }
 +      if (*idx >= type->cnt) {
 +              *idx = -1;
 +              return;
 +      }
 +
 +      if (out_start_pfn)
 +              *out_start_pfn = PFN_UP(r->base);
 +      if (out_end_pfn)
 +              *out_end_pfn = PFN_DOWN(r->base + r->size);
 +      if (out_nid)
 +              *out_nid = r->nid;
 +}
 +
 +/**
 + * memblock_set_node - set node ID on memblock regions
 + * @base: base of area to set node ID for
 + * @size: size of area to set node ID for
 + * @nid: node ID to set
 + *
 + * Set the nid of memblock memory regions in [@base,@base+@size) to @nid.
 + * Regions which cross the area boundaries are split as necessary.
 + *
 + * RETURNS:
 + * 0 on success, -errno on failure.
 + */
 +int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
 +                                    int nid)
 +{
 +      struct memblock_type *type = &memblock.memory;
 +      phys_addr_t end = base + size;
 +      int i;
 +
 +      /* we'll create at most two more regions */
 +      while (type->cnt + 2 > type->max)
 +              if (memblock_double_array(type) < 0)
 +                      return -ENOMEM;
 +
 +      for (i = 0; i < type->cnt; i++) {
 +              struct memblock_region *rgn = &type->regions[i];
 +              phys_addr_t rbase = rgn->base;
 +              phys_addr_t rend = rbase + rgn->size;
 +
 +              if (rbase >= end)
 +                      break;
 +              if (rend <= base)
 +                      continue;
 +
 +              if (rbase < base) {
 +                      /*
 +                       * @rgn intersects from below.  Split and continue
 +                       * to process the next region - the new top half.
 +                       */
 +                      rgn->base = base;
 +                      rgn->size = rend - rgn->base;
 +                      memblock_insert_region(type, i, rbase, base - rbase,
 +                                             rgn->nid);
 +              } else if (rend > end) {
 +                      /*
 +                       * @rgn intersects from above.  Split and redo the
 +                       * current region - the new bottom half.
 +                       */
 +                      rgn->base = end;
 +                      rgn->size = rend - rgn->base;
 +                      memblock_insert_region(type, i--, rbase, end - rbase,
 +                                             rgn->nid);
 +              } else {
 +                      /* @rgn is fully contained, set ->nid */
 +                      rgn->nid = nid;
 +              }
 +      }
 +
 +      memblock_merge_regions(type);
 +      return 0;
 +}
 +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 +
  phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
  {
        phys_addr_t found;
        /* We align the size to limit fragmentation. Without this, a lot of
         * small allocs quickly eat up the whole reserve array on sparc
         */
 -      size = memblock_align_up(size, align);
 +      size = round_up(size, align);
  
 -      found = memblock_find_base(size, align, 0, max_addr);
 -      if (found != MEMBLOCK_ERROR &&
 -          !memblock_add_region(&memblock.reserved, found, size))
 +      found = memblock_find_in_range(0, max_addr, size, align);
 +      if (found && !memblock_add_region(&memblock.reserved, found, size))
                return found;
  
        return 0;
@@@ -677,78 -517,92 +678,78 @@@ phys_addr_t __init memblock_alloc(phys_
  
  
  /*
 - * Additional node-local allocators. Search for node memory is bottom up
 - * and walks memblock regions within that node bottom-up as well, but allocation
 - * within an memblock region is top-down. XXX I plan to fix that at some stage
 + * Additional node-local top-down allocators.
   *
   * WARNING: Only available after early_node_map[] has been populated,
   * on some architectures, that is after all the calls to add_active_range()
   * have been done to populate it.
   */
  
 -phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid)
 +static phys_addr_t __init memblock_nid_range_rev(phys_addr_t start,
 +                                               phys_addr_t end, int *nid)
  {
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
 -      /*
 -       * This code originates from sparc which really wants use to walk by addresses
 -       * and returns the nid. This is not very convenient for early_pfn_map[] users
 -       * as the map isn't sorted yet, and it really wants to be walked by nid.
 -       *
 -       * For now, I implement the inefficient method below which walks the early
 -       * map multiple times. Eventually we may want to use an ARCH config option
 -       * to implement a completely different method for both case.
 -       */
        unsigned long start_pfn, end_pfn;
        int i;
  
 -      for (i = 0; i < MAX_NUMNODES; i++) {
 -              get_pfn_range_for_nid(i, &start_pfn, &end_pfn);
 -              if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn))
 -                      continue;
 -              *nid = i;
 -              return min(end, PFN_PHYS(end_pfn));
 -      }
 +      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, nid)
 +              if (end > PFN_PHYS(start_pfn) && end <= PFN_PHYS(end_pfn))
 +                      return max(start, PFN_PHYS(start_pfn));
  #endif
        *nid = 0;
 -
 -      return end;
 +      return start;
  }
  
 -static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
 +phys_addr_t __init memblock_find_in_range_node(phys_addr_t start,
 +                                             phys_addr_t end,
                                               phys_addr_t size,
                                               phys_addr_t align, int nid)
  {
 -      phys_addr_t start, end;
 +      struct memblock_type *mem = &memblock.memory;
 +      int i;
 +
 +      BUG_ON(0 == size);
 +
 +      /* Pump up max_addr */
 +      if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
 +              end = memblock.current_limit;
  
 -      start = mp->base;
 -      end = start + mp->size;
 +      for (i = mem->cnt - 1; i >= 0; i--) {
 +              struct memblock_region *r = &mem->regions[i];
 +              phys_addr_t base = max(start, r->base);
 +              phys_addr_t top = min(end, r->base + r->size);
  
 -      start = memblock_align_up(start, align);
 -      while (start < end) {
 -              phys_addr_t this_end;
 -              int this_nid;
 +              while (base < top) {
 +                      phys_addr_t tbase, ret;
 +                      int tnid;
  
 -              this_end = memblock_nid_range(start, end, &this_nid);
 -              if (this_nid == nid) {
 -                      phys_addr_t ret = memblock_find_region(start, this_end, size, align);
 -                      if (ret != MEMBLOCK_ERROR &&
 -                          !memblock_add_region(&memblock.reserved, ret, size))
 -                              return ret;
 +                      tbase = memblock_nid_range_rev(base, top, &tnid);
 +                      if (nid == MAX_NUMNODES || tnid == nid) {
 +                              ret = memblock_find_region(tbase, top, size, align);
 +                              if (ret)
 +                                      return ret;
 +                      }
 +                      top = tbase;
                }
 -              start = this_end;
        }
  
 -      return MEMBLOCK_ERROR;
 +      return 0;
  }
  
  phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
  {
 -      struct memblock_type *mem = &memblock.memory;
 -      int i;
 -
 -      BUG_ON(0 == size);
 +      phys_addr_t found;
  
 -      /* We align the size to limit fragmentation. Without this, a lot of
 +      /*
 +       * We align the size to limit fragmentation. Without this, a lot of
         * small allocs quickly eat up the whole reserve array on sparc
         */
 -      size = memblock_align_up(size, align);
 +      size = round_up(size, align);
  
 -      /* We do a bottom-up search for a region with the right
 -       * nid since that's easier considering how memblock_nid_range()
 -       * works
 -       */
 -      for (i = 0; i < mem->cnt; i++) {
 -              phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
 -                                             size, align, nid);
 -              if (ret != MEMBLOCK_ERROR)
 -                      return ret;
 -      }
 +      found = memblock_find_in_range_node(0, MEMBLOCK_ALLOC_ACCESSIBLE,
 +                                          size, align, nid);
 +      if (found && !memblock_add_region(&memblock.reserved, found, size))
 +              return found;
  
        return 0;
  }
@@@ -759,7 -613,7 +760,7 @@@ phys_addr_t __init memblock_alloc_try_n
  
        if (res)
                return res;
 -      return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
 +      return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
  }
  
  
@@@ -773,6 -627,12 +774,12 @@@ phys_addr_t __init memblock_phys_mem_si
        return memblock.memory_size;
  }
  
+ /* lowest address */
+ phys_addr_t __init_memblock memblock_start_of_DRAM(void)
+ {
+       return memblock.memory.regions[0].base;
+ }
  phys_addr_t __init_memblock memblock_end_of_DRAM(void)
  {
        int idx = memblock.memory.cnt - 1;
@@@ -871,26 -731,19 +878,26 @@@ void __init_memblock memblock_set_curre
        memblock.current_limit = limit;
  }
  
 -static void __init_memblock memblock_dump(struct memblock_type *region, char *name)
 +static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
  {
        unsigned long long base, size;
        int i;
  
 -      pr_info(" %s.cnt  = 0x%lx\n", name, region->cnt);
 -
 -      for (i = 0; i < region->cnt; i++) {
 -              base = region->regions[i].base;
 -              size = region->regions[i].size;
 +      pr_info(" %s.cnt  = 0x%lx\n", name, type->cnt);
  
 -              pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n",
 -                  name, i, base, base + size - 1, size);
 +      for (i = 0; i < type->cnt; i++) {
 +              struct memblock_region *rgn = &type->regions[i];
 +              char nid_buf[32] = "";
 +
 +              base = rgn->base;
 +              size = rgn->size;
 +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 +              if (memblock_get_region_node(rgn) != MAX_NUMNODES)
 +                      snprintf(nid_buf, sizeof(nid_buf), " on node %d",
 +                               memblock_get_region_node(rgn));
 +#endif
 +              pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n",
 +                      name, i, base, base + size - 1, size, nid_buf);
        }
  }
  
@@@ -912,9 -765,9 +919,9 @@@ void __init memblock_analyze(void
  
        /* Check marker in the unused last array entry */
        WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
-               != (phys_addr_t)RED_INACTIVE);
+               != MEMBLOCK_INACTIVE);
        WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
-               != (phys_addr_t)RED_INACTIVE);
+               != MEMBLOCK_INACTIVE);
  
        memblock.memory_size = 0;
  
@@@ -940,21 -793,19 +947,21 @@@ void __init memblock_init(void
        memblock.reserved.max   = INIT_MEMBLOCK_REGIONS;
  
        /* Write a marker in the unused last array entry */
-       memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
-       memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
+       memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
+       memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
  
        /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
         * This simplifies the memblock_add() code below...
         */
        memblock.memory.regions[0].base = 0;
        memblock.memory.regions[0].size = 0;
 +      memblock_set_region_node(&memblock.memory.regions[0], MAX_NUMNODES);
        memblock.memory.cnt = 1;
  
        /* Ditto. */
        memblock.reserved.regions[0].base = 0;
        memblock.reserved.regions[0].size = 0;
 +      memblock_set_region_node(&memblock.reserved.regions[0], MAX_NUMNODES);
        memblock.reserved.cnt = 1;
  
        memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
@@@ -968,7 -819,7 +975,7 @@@ static int __init early_memblock(char *
  }
  early_param("memblock", early_memblock);
  
 -#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK)
 +#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK)
  
  static int memblock_debug_show(struct seq_file *m, void *private)
  {
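Illustrative only: __next_free_mem_range() above backs the for_each_free_mem_range() iterator (declared in memblock.h, used in mm/nobootmem.c below), which yields each memory-minus-reserved intersection. A sketch of a hypothetical caller summing free early memory; total_free_early_mem() is an invented name.

static phys_addr_t __init total_free_early_mem(void)
{
	phys_addr_t start, end, total = 0;
	u64 i;

	/* iterate free ranges (memory and not reserved) on all nodes */
	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
		total += end - start;

	return total;
}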
diff --combined mm/nobootmem.c
index 29d948ce6d0f7cbe58ffb04ce9bb49fe01ba935d,7fa41b4a07bf2c129d57d2df5be92b434252ca7e..24f0fc1a56d60ebbbacf1950ac0ebec7067dbf41
@@@ -12,7 -12,7 +12,7 @@@
  #include <linux/pfn.h>
  #include <linux/slab.h>
  #include <linux/bootmem.h>
- #include <linux/module.h>
+ #include <linux/export.h>
  #include <linux/kmemleak.h>
  #include <linux/range.h>
  #include <linux/memblock.h>
@@@ -41,13 -41,14 +41,13 @@@ static void * __init __alloc_memory_cor
        if (limit > memblock.current_limit)
                limit = memblock.current_limit;
  
 -      addr = find_memory_core_early(nid, size, align, goal, limit);
 -
 -      if (addr == MEMBLOCK_ERROR)
 +      addr = memblock_find_in_range_node(goal, limit, size, align, nid);
 +      if (!addr)
                return NULL;
  
        ptr = phys_to_virt(addr);
        memset(ptr, 0, size);
 -      memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
 +      memblock_reserve(addr, size);
        /*
         * The min_count is set to 0 so that bootmem allocated blocks
         * are never reported as leaks.
@@@ -106,27 -107,23 +106,27 @@@ static void __init __free_pages_memory(
                __free_pages_bootmem(pfn_to_page(i), 0);
  }
  
 -unsigned long __init free_all_memory_core_early(int nodeid)
 +unsigned long __init free_low_memory_core_early(int nodeid)
  {
 -      int i;
 -      u64 start, end;
        unsigned long count = 0;
 -      struct range *range = NULL;
 -      int nr_range;
 -
 -      nr_range = get_free_all_memory_range(&range, nodeid);
 -
 -      for (i = 0; i < nr_range; i++) {
 -              start = range[i].start;
 -              end = range[i].end;
 -              count += end - start;
 -              __free_pages_memory(start, end);
 +      phys_addr_t start, end;
 +      u64 i;
 +
 +      /* free reserved array temporarily so that it's treated as free area */
 +      memblock_free_reserved_regions();
 +
 +      for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
 +              unsigned long start_pfn = PFN_UP(start);
 +              unsigned long end_pfn = min_t(unsigned long,
 +                                            PFN_DOWN(end), max_low_pfn);
 +              if (start_pfn < end_pfn) {
 +                      __free_pages_memory(start_pfn, end_pfn);
 +                      count += end_pfn - start_pfn;
 +              }
        }
  
 +      /* put region array back? */
 +      memblock_reserve_reserved_regions();
        return count;
  }
  
@@@ -140,7 -137,7 +140,7 @@@ unsigned long __init free_all_bootmem_n
  {
        register_page_bootmem_info_node(pgdat);
  
 -      /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
 +      /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
        return 0;
  }
  
@@@ -158,7 -155,7 +158,7 @@@ unsigned long __init free_all_bootmem(v
         * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
         *  will be used instead of only Node0 related
         */
 -      return free_all_memory_core_early(MAX_NUMNODES);
 +      return free_low_memory_core_early(MAX_NUMNODES);
  }
  
  /**
@@@ -175,7 -172,7 +175,7 @@@ void __init free_bootmem_node(pg_data_
                              unsigned long size)
  {
        kmemleak_free_part(__va(physaddr), size);
 -      memblock_x86_free_range(physaddr, physaddr + size);
 +      memblock_free(physaddr, size);
  }
  
  /**
  void __init free_bootmem(unsigned long addr, unsigned long size)
  {
        kmemleak_free_part(__va(addr), size);
 -      memblock_x86_free_range(addr, addr + size);
 +      memblock_free(addr, size);
  }
  
  static void * __init ___alloc_bootmem_nopanic(unsigned long size,
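For context (not part of the change): the generic memblock calls substituted in this file take a base and a size, where the old x86-specific helpers took a start/end pair. A hedged sketch with hypothetical values:

static void __init early_scratch_example(void)
{
	phys_addr_t base = 0x100000;	/* hypothetical address */
	phys_addr_t size = PAGE_SIZE;

	memblock_reserve(base, size);	/* old: memblock_x86_reserve_range(base, base + size, "...") */
	/* ... early-boot use of the area ... */
	memblock_free(base, size);	/* old: memblock_x86_free_range(base, base + size) */
}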
diff --combined mm/page_alloc.c
index 3c7ea45ffba96ef0bc020c763ea334c68fb8fee9,9dd443d89d8be665813bbeb4e17e54fafde46428..6ce27331834c94ee944a68fed300abc57dd1d850
@@@ -182,31 -182,28 +182,31 @@@ static unsigned long __meminitdata nr_a
  static unsigned long __meminitdata dma_reserve;
  
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
 -  /*
 -   * MAX_ACTIVE_REGIONS determines the maximum number of distinct
 -   * ranges of memory (RAM) that may be registered with add_active_range().
 -   * Ranges passed to add_active_range() will be merged if possible
 -   * so the number of times add_active_range() can be called is
 -   * related to the number of nodes and the number of holes
 -   */
 -  #ifdef CONFIG_MAX_ACTIVE_REGIONS
 -    /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
 -    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
 -  #else
 -    #if MAX_NUMNODES >= 32
 -      /* If there can be many nodes, allow up to 50 holes per node */
 -      #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
 +  #ifndef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 +    /*
 +     * MAX_ACTIVE_REGIONS determines the maximum number of distinct ranges
 +     * of memory (RAM) that may be registered with add_active_range().
 +     * Ranges passed to add_active_range() will be merged if possible so
 +     * the number of times add_active_range() can be called is related to
 +     * the number of nodes and the number of holes
 +     */
 +    #ifdef CONFIG_MAX_ACTIVE_REGIONS
 +      /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
 +      #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
      #else
 -      /* By default, allow up to 256 distinct regions */
 -      #define MAX_ACTIVE_REGIONS 256
 +      #if MAX_NUMNODES >= 32
 +        /* If there can be many nodes, allow up to 50 holes per node */
 +        #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
 +      #else
 +        /* By default, allow up to 256 distinct regions */
 +        #define MAX_ACTIVE_REGIONS 256
 +      #endif
      #endif
 -  #endif
  
 -  static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
 -  static int __meminitdata nr_nodemap_entries;
 +    static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
 +    static int __meminitdata nr_nodemap_entries;
 +#endif /* !CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 +
    static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
    static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
    static unsigned long __initdata required_kernelcore;
@@@ -321,6 -318,7 +321,7 @@@ static void bad_page(struct page *page
                current->comm, page_to_pfn(page));
        dump_page(page);
  
+       print_modules();
        dump_stack();
  out:
        /* Leave bad fields for debug, except PageBuddy could make trouble */
@@@ -708,10 -706,10 +709,10 @@@ void __meminit __free_pages_bootmem(str
                int loop;
  
                prefetchw(page);
 -              for (loop = 0; loop < BITS_PER_LONG; loop++) {
 +              for (loop = 0; loop < (1 << order); loop++) {
                        struct page *p = &page[loop];
  
 -                      if (loop + 1 < BITS_PER_LONG)
 +                      if (loop + 1 < (1 << order))
                                prefetchw(p + 1);
                        __ClearPageReserved(p);
                        set_page_count(p, 0);
@@@ -1373,21 -1371,12 +1374,12 @@@ failed
  
  #ifdef CONFIG_FAIL_PAGE_ALLOC
  
- static struct fail_page_alloc_attr {
+ static struct {
        struct fault_attr attr;
  
        u32 ignore_gfp_highmem;
        u32 ignore_gfp_wait;
        u32 min_order;
- #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-       struct dentry *ignore_gfp_highmem_file;
-       struct dentry *ignore_gfp_wait_file;
-       struct dentry *min_order_file;
- #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
  } fail_page_alloc = {
        .attr = FAULT_ATTR_INITIALIZER,
        .ignore_gfp_wait = 1,
@@@ -1421,36 -1410,27 +1413,27 @@@ static int __init fail_page_alloc_debug
  {
        mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
        struct dentry *dir;
-       int err;
-       err = init_fault_attr_dentries(&fail_page_alloc.attr,
-                                      "fail_page_alloc");
-       if (err)
-               return err;
-       dir = fail_page_alloc.attr.dentries.dir;
-       fail_page_alloc.ignore_gfp_wait_file =
-               debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                                     &fail_page_alloc.ignore_gfp_wait);
-       fail_page_alloc.ignore_gfp_highmem_file =
-               debugfs_create_bool("ignore-gfp-highmem", mode, dir,
-                                     &fail_page_alloc.ignore_gfp_highmem);
-       fail_page_alloc.min_order_file =
-               debugfs_create_u32("min-order", mode, dir,
-                                  &fail_page_alloc.min_order);
-       if (!fail_page_alloc.ignore_gfp_wait_file ||
-             !fail_page_alloc.ignore_gfp_highmem_file ||
-             !fail_page_alloc.min_order_file) {
-               err = -ENOMEM;
-               debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
-               debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
-               debugfs_remove(fail_page_alloc.min_order_file);
-               cleanup_fault_attr_dentries(&fail_page_alloc.attr);
-       }
  
-       return err;
+       dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+                                       &fail_page_alloc.attr);
+       if (IS_ERR(dir))
+               return PTR_ERR(dir);
+       if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+                               &fail_page_alloc.ignore_gfp_wait))
+               goto fail;
+       if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+                               &fail_page_alloc.ignore_gfp_highmem))
+               goto fail;
+       if (!debugfs_create_u32("min-order", mode, dir,
+                               &fail_page_alloc.min_order))
+               goto fail;
+       return 0;
+ fail:
+       debugfs_remove_recursive(dir);
+       return -ENOMEM;
  }
  
  late_initcall(fail_page_alloc_debugfs);
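
The rewritten fail_page_alloc_debugfs() above leans on fault_create_debugfs_attr() reporting failure through an error value encoded in the returned pointer, hence the IS_ERR()/PTR_ERR() check instead of a NULL test, and it now tears everything down with a single debugfs_remove_recursive(). For readers unfamiliar with that convention, here is a rough userspace imitation of the ERR_PTR helpers (the names mirror <linux/err.h>, but this toy version is mine, not the kernel's):

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    /* Userspace stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR(). */
    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
            return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
    }

    /* A fake "create directory" call that fails the way the debugfs helpers do. */
    static void *create_dir(const char *name)
    {
            static char dir;                  /* any non-error pointer will do */

            if (!name)
                    return ERR_PTR(-EINVAL);  /* the errno travels inside the pointer */
            return &dir;
    }

    int main(void)
    {
            void *dir = create_dir(NULL);

            if (IS_ERR(dir))
                    printf("create_dir failed: %ld\n", PTR_ERR(dir));  /* -22 */
            return 0;
    }
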
@@@ -1619,6 -1599,21 +1602,21 @@@ static void zlc_mark_zone_full(struct z
        set_bit(i, zlc->fullzones);
  }
  
+ /*
+  * clear all zones full, called after direct reclaim makes progress so that
+  * a zone that was recently full is not skipped over for up to a second
+  */
+ static void zlc_clear_zones_full(struct zonelist *zonelist)
+ {
+       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
+       zlc = zonelist->zlcache_ptr;
+       if (!zlc)
+               return;
+       bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ }
  #else /* CONFIG_NUMA */
  
  static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@@ -1635,6 -1630,10 +1633,10 @@@ static int zlc_zone_worth_trying(struc
  static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
  {
  }
+ static void zlc_clear_zones_full(struct zonelist *zonelist)
+ {
+ }
  #endif        /* CONFIG_NUMA */
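
Taken together, zlc_mark_zone_full(), zlc_zone_worth_trying() and the new zlc_clear_zones_full() form a small per-zonelist bitmap cache: zones that recently failed are skipped during the scan, and the whole bitmap is wiped once direct reclaim reports progress so those zones become candidates again. A compilable miniature of that lifecycle, using one unsigned long as the bitmap (illustration only; the real code uses bitmap_zero()/set_bit()/test_bit() on zlc->fullzones):

    #include <stdio.h>

    #define MAX_ZONES 8

    static unsigned long fullzones;                   /* one bit per zone */

    static void mark_zone_full(int z)    { fullzones |= 1UL << z; }
    static int  zone_worth_trying(int z) { return !(fullzones & (1UL << z)); }
    static void clear_zones_full(void)   { fullzones = 0; }

    int main(void)
    {
            mark_zone_full(2);                        /* allocation in zone 2 failed */

            for (int z = 0; z < MAX_ZONES; z++)
                    if (!zone_worth_trying(z))
                            printf("skipping zone %d\n", z);

            clear_zones_full();                       /* direct reclaim made progress */
            printf("zone 2 worth trying again: %d\n", zone_worth_trying(2));  /* 1 */
            return 0;
    }
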
  
  /*
@@@ -1667,7 -1666,7 +1669,7 @@@ zonelist_scan
                                continue;
                if ((alloc_flags & ALLOC_CPUSET) &&
                        !cpuset_zone_allowed_softwall(zone, gfp_mask))
-                               goto try_next_zone;
+                               continue;
  
                BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
                                    classzone_idx, alloc_flags))
                                goto try_this_zone;
  
+                       if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+                               /*
+                                * we do zlc_setup if there are multiple nodes
+                                * and before considering the first zone allowed
+                                * by the cpuset.
+                                */
+                               allowednodes = zlc_setup(zonelist, alloc_flags);
+                               zlc_active = 1;
+                               did_zlc_setup = 1;
+                       }
                        if (zone_reclaim_mode == 0)
                                goto this_zone_full;
  
+                       /*
+                        * As we may have just activated ZLC, check if the first
+                        * eligible zone has failed zone_reclaim recently.
+                        */
+                       if (NUMA_BUILD && zlc_active &&
+                               !zlc_zone_worth_trying(zonelist, z, allowednodes))
+                               continue;
                        ret = zone_reclaim(zone, gfp_mask, order);
                        switch (ret) {
                        case ZONE_RECLAIM_NOSCAN:
                                /* did not scan */
-                               goto try_next_zone;
+                               continue;
                        case ZONE_RECLAIM_FULL:
                                /* scanned but unreclaimable */
-                               goto this_zone_full;
+                               continue;
                        default:
                                /* did we reclaim enough */
                                if (!zone_watermark_ok(zone, order, mark,
@@@ -1706,16 -1724,6 +1727,6 @@@ try_this_zone
  this_zone_full:
                if (NUMA_BUILD)
                        zlc_mark_zone_full(zonelist, z);
- try_next_zone:
-               if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
-                       /*
-                        * we do zlc_setup after the first zone is tried but only
-                        * if there are multiple nodes make it worthwhile
-                        */
-                       allowednodes = zlc_setup(zonelist, alloc_flags);
-                       zlc_active = 1;
-                       did_zlc_setup = 1;
-               }
        }
  
        if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@@ -1746,7 -1754,6 +1757,6 @@@ static DEFINE_RATELIMIT_STATE(nopage_rs
  
  void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
  {
-       va_list args;
        unsigned int filter = SHOW_MEM_FILTER_NODES;
  
        if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
                filter &= ~SHOW_MEM_FILTER_NODES;
  
        if (fmt) {
-               printk(KERN_WARNING);
+               struct va_format vaf;
+               va_list args;
                va_start(args, fmt);
-               vprintk(fmt, args);
+               vaf.fmt = fmt;
+               vaf.va = &args;
+               pr_warn("%pV", &vaf);
                va_end(args);
        }
  
-       pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
-                  current->comm, order, gfp_mask);
+       pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+               current->comm, order, gfp_mask);
  
        dump_stack();
        if (!should_suppress_show_mem())
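
Switching from printk()+vprintk() to a struct va_format passed through printk's %pV specifier emits the caller-supplied message and the "page allocation failure" line as single, uninterleaved records. The underlying idea, capturing a va_list and formatting it inside one output call, can be sketched in plain userspace C (%pV itself exists only in the kernel's printk, so this is an analogy, not the same machinery):

    #include <stdarg.h>
    #include <stdio.h>

    /* Format the variadic part and the prefix in one output call, which is
     * roughly what struct va_format + %pV buys the kernel's printk path. */
    static void warn_prefixed(const char *who, const char *fmt, ...)
    {
            char msg[256];
            va_list args;

            va_start(args, fmt);
            vsnprintf(msg, sizeof(msg), fmt, args);   /* consume the va_list once */
            va_end(args);

            fprintf(stderr, "%s: %s\n", who, msg);
    }

    int main(void)
    {
            warn_prefixed("demo", "page allocation failure: order:%d, mode:0x%x",
                          1, 0x20);
            return 0;
    }
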
@@@ -1957,6 -1971,10 +1974,10 @@@ __alloc_pages_direct_reclaim(gfp_t gfp_
        if (unlikely(!(*did_some_progress)))
                return NULL;
  
+       /* After successful reclaim, reconsider all zones for allocation */
+       if (NUMA_BUILD)
+               zlc_clear_zones_full(zonelist);
  retry:
        page = get_page_from_freelist(gfp_mask, nodemask, order,
                                        zonelist, high_zoneidx,
@@@ -3714,6 -3732,34 +3735,6 @@@ __meminit int init_currently_empty_zone
  }
  
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
 -/*
 - * Basic iterator support. Return the first range of PFNs for a node
 - * Note: nid == MAX_NUMNODES returns first region regardless of node
 - */
 -static int __meminit first_active_region_index_in_nid(int nid)
 -{
 -      int i;
 -
 -      for (i = 0; i < nr_nodemap_entries; i++)
 -              if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
 -                      return i;
 -
 -      return -1;
 -}
 -
 -/*
 - * Basic iterator support. Return the next active range of PFNs for a node
 - * Note: nid == MAX_NUMNODES returns next region regardless of node
 - */
 -static int __meminit next_active_region_index_in_nid(int index, int nid)
 -{
 -      for (index = index + 1; index < nr_nodemap_entries; index++)
 -              if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
 -                      return index;
 -
 -      return -1;
 -}
 -
  #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
  /*
   * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
   */
  int __meminit __early_pfn_to_nid(unsigned long pfn)
  {
 -      int i;
 -
 -      for (i = 0; i < nr_nodemap_entries; i++) {
 -              unsigned long start_pfn = early_node_map[i].start_pfn;
 -              unsigned long end_pfn = early_node_map[i].end_pfn;
 +      unsigned long start_pfn, end_pfn;
 +      int i, nid;
  
 +      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
                if (start_pfn <= pfn && pfn < end_pfn)
 -                      return early_node_map[i].nid;
 -      }
 +                      return nid;
        /* This is a memory hole */
        return -1;
  }
@@@ -3757,6 -3806,11 +3778,6 @@@ bool __meminit early_pfn_in_nid(unsigne
  }
  #endif
  
 -/* Basic iterator support to walk early_node_map[] */
 -#define for_each_active_range_index_in_nid(i, nid) \
 -      for (i = first_active_region_index_in_nid(nid); i != -1; \
 -                              i = next_active_region_index_in_nid(i, nid))
 -
  /**
   * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
   * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
   * add_active_ranges() contain no holes and may be freed, this
   * function may be used instead of calling free_bootmem() manually.
   */
 -void __init free_bootmem_with_active_regions(int nid,
 -                                              unsigned long max_low_pfn)
 +void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
  {
 -      int i;
 -
 -      for_each_active_range_index_in_nid(i, nid) {
 -              unsigned long size_pages = 0;
 -              unsigned long end_pfn = early_node_map[i].end_pfn;
 -
 -              if (early_node_map[i].start_pfn >= max_low_pfn)
 -                      continue;
 +      unsigned long start_pfn, end_pfn;
 +      int i, this_nid;
  
 -              if (end_pfn > max_low_pfn)
 -                      end_pfn = max_low_pfn;
 +      for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
 +              start_pfn = min(start_pfn, max_low_pfn);
 +              end_pfn = min(end_pfn, max_low_pfn);
  
 -              size_pages = end_pfn - early_node_map[i].start_pfn;
 -              free_bootmem_node(NODE_DATA(early_node_map[i].nid),
 -                              PFN_PHYS(early_node_map[i].start_pfn),
 -                              size_pages << PAGE_SHIFT);
 +              if (start_pfn < end_pfn)
 +                      free_bootmem_node(NODE_DATA(this_nid),
 +                                        PFN_PHYS(start_pfn),
 +                                        (end_pfn - start_pfn) << PAGE_SHIFT);
        }
  }
  
 -#ifdef CONFIG_HAVE_MEMBLOCK
 -/*
 - * Basic iterator support. Return the last range of PFNs for a node
 - * Note: nid == MAX_NUMNODES returns last region regardless of node
 - */
 -static int __meminit last_active_region_index_in_nid(int nid)
 -{
 -      int i;
 -
 -      for (i = nr_nodemap_entries - 1; i >= 0; i--)
 -              if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
 -                      return i;
 -
 -      return -1;
 -}
 -
 -/*
 - * Basic iterator support. Return the previous active range of PFNs for a node
 - * Note: nid == MAX_NUMNODES returns previous region regardless of node
 - */
 -static int __meminit previous_active_region_index_in_nid(int index, int nid)
 -{
 -      for (index = index - 1; index >= 0; index--)
 -              if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
 -                      return index;
 -
 -      return -1;
 -}
 -
 -#define for_each_active_range_index_in_nid_reverse(i, nid) \
 -      for (i = last_active_region_index_in_nid(nid); i != -1; \
 -                              i = previous_active_region_index_in_nid(i, nid))
 -
 -u64 __init find_memory_core_early(int nid, u64 size, u64 align,
 -                                      u64 goal, u64 limit)
 -{
 -      int i;
 -
 -      /* Need to go over early_node_map to find out good range for node */
 -      for_each_active_range_index_in_nid_reverse(i, nid) {
 -              u64 addr;
 -              u64 ei_start, ei_last;
 -              u64 final_start, final_end;
 -
 -              ei_last = early_node_map[i].end_pfn;
 -              ei_last <<= PAGE_SHIFT;
 -              ei_start = early_node_map[i].start_pfn;
 -              ei_start <<= PAGE_SHIFT;
 -
 -              final_start = max(ei_start, goal);
 -              final_end = min(ei_last, limit);
 -
 -              if (final_start >= final_end)
 -                      continue;
 -
 -              addr = memblock_find_in_range(final_start, final_end, size, align);
 -
 -              if (addr == MEMBLOCK_ERROR)
 -                      continue;
 -
 -              return addr;
 -      }
 -
 -      return MEMBLOCK_ERROR;
 -}
 -#endif
 -
  int __init add_from_early_node_map(struct range *range, int az,
                                   int nr_range, int nid)
  {
 +      unsigned long start_pfn, end_pfn;
        int i;
 -      u64 start, end;
  
        /* need to go over early_node_map to find out good range for node */
 -      for_each_active_range_index_in_nid(i, nid) {
 -              start = early_node_map[i].start_pfn;
 -              end = early_node_map[i].end_pfn;
 -              nr_range = add_range(range, az, nr_range, start, end);
 -      }
 +      for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
 +              nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
        return nr_range;
  }
  
 -void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
 -{
 -      int i;
 -      int ret;
 -
 -      for_each_active_range_index_in_nid(i, nid) {
 -              ret = work_fn(early_node_map[i].start_pfn,
 -                            early_node_map[i].end_pfn, data);
 -              if (ret)
 -                      break;
 -      }
 -}
  /**
   * sparse_memory_present_with_active_regions - Call memory_present for each active range
   * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
   */
  void __init sparse_memory_present_with_active_regions(int nid)
  {
 -      int i;
 +      unsigned long start_pfn, end_pfn;
 +      int i, this_nid;
  
 -      for_each_active_range_index_in_nid(i, nid)
 -              memory_present(early_node_map[i].nid,
 -                              early_node_map[i].start_pfn,
 -                              early_node_map[i].end_pfn);
 +      for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
 +              memory_present(this_nid, start_pfn, end_pfn);
  }
  
  /**
  void __meminit get_pfn_range_for_nid(unsigned int nid,
                        unsigned long *start_pfn, unsigned long *end_pfn)
  {
 +      unsigned long this_start_pfn, this_end_pfn;
        int i;
 +
        *start_pfn = -1UL;
        *end_pfn = 0;
  
 -      for_each_active_range_index_in_nid(i, nid) {
 -              *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
 -              *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
 +      for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
 +              *start_pfn = min(*start_pfn, this_start_pfn);
 +              *end_pfn = max(*end_pfn, this_end_pfn);
        }
  
        if (*start_pfn == -1UL)
@@@ -3936,16 -4077,46 +3957,16 @@@ unsigned long __meminit __absent_pages_
                                unsigned long range_start_pfn,
                                unsigned long range_end_pfn)
  {
 -      int i = 0;
 -      unsigned long prev_end_pfn = 0, hole_pages = 0;
 -      unsigned long start_pfn;
 -
 -      /* Find the end_pfn of the first active range of pfns in the node */
 -      i = first_active_region_index_in_nid(nid);
 -      if (i == -1)
 -              return 0;
 -
 -      prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
 -
 -      /* Account for ranges before physical memory on this node */
 -      if (early_node_map[i].start_pfn > range_start_pfn)
 -              hole_pages = prev_end_pfn - range_start_pfn;
 -
 -      /* Find all holes for the zone within the node */
 -      for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
 -
 -              /* No need to continue if prev_end_pfn is outside the zone */
 -              if (prev_end_pfn >= range_end_pfn)
 -                      break;
 -
 -              /* Make sure the end of the zone is not within the hole */
 -              start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
 -              prev_end_pfn = max(prev_end_pfn, range_start_pfn);
 +      unsigned long nr_absent = range_end_pfn - range_start_pfn;
 +      unsigned long start_pfn, end_pfn;
 +      int i;
  
 -              /* Update the hole size count and move on */
 -              if (start_pfn > range_start_pfn) {
 -                      BUG_ON(prev_end_pfn > start_pfn);
 -                      hole_pages += start_pfn - prev_end_pfn;
 -              }
 -              prev_end_pfn = early_node_map[i].end_pfn;
 +      for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
 +              start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
 +              end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
 +              nr_absent -= end_pfn - start_pfn;
        }
 -
 -      /* Account for ranges past physical memory on this node */
 -      if (range_end_pfn > prev_end_pfn)
 -              hole_pages += range_end_pfn -
 -                              max(range_start_pfn, prev_end_pfn);
 -
 -      return hole_pages;
 +      return nr_absent;
  }
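
The new __absent_pages_in_range() starts by assuming the whole [range_start_pfn, range_end_pfn) span is a hole and then subtracts each present range after clamping it into the span, which removes the old running prev_end_pfn bookkeeping. A self-contained model of that arithmetic with made-up ranges:

    #include <stdio.h>

    struct range { unsigned long start, end; };       /* [start, end) in pages */

    static unsigned long clamp(unsigned long v, unsigned long lo, unsigned long hi)
    {
            return v < lo ? lo : (v > hi ? hi : v);
    }

    int main(void)
    {
            /* Hypothetical present-memory ranges for one node. */
            struct range mem[] = { { 0, 100 }, { 150, 200 }, { 400, 512 } };
            unsigned long range_start = 50, range_end = 450;
            unsigned long nr_absent = range_end - range_start;   /* assume all holes */

            for (unsigned int i = 0; i < sizeof(mem) / sizeof(mem[0]); i++) {
                    unsigned long s = clamp(mem[i].start, range_start, range_end);
                    unsigned long e = clamp(mem[i].end,   range_start, range_end);

                    nr_absent -= e - s;               /* subtract what is present */
            }
            /* prints: absent pages in [50, 450): 250 */
            printf("absent pages in [%lu, %lu): %lu\n",
                   range_start, range_end, nr_absent);
            return 0;
    }
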
  
  /**
@@@ -3966,14 -4137,14 +3987,14 @@@ static unsigned long __meminit zone_abs
                                        unsigned long zone_type,
                                        unsigned long *ignored)
  {
 +      unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
 +      unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
        unsigned long node_start_pfn, node_end_pfn;
        unsigned long zone_start_pfn, zone_end_pfn;
  
        get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
 -      zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
 -                                                      node_start_pfn);
 -      zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
 -                                                      node_end_pfn);
 +      zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
 +      zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
  
        adjust_zone_range_for_zone_movable(nid, zone_type,
                        node_start_pfn, node_end_pfn,
@@@ -4271,35 -4442,6 +4292,35 @@@ static inline void setup_nr_node_ids(vo
  }
  #endif
  
 +#ifndef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 +/*
 + * Common iterator interface used to define for_each_mem_pfn_range().
 + */
 +void __meminit __next_mem_pfn_range(int *idx, int nid,
 +                                  unsigned long *out_start_pfn,
 +                                  unsigned long *out_end_pfn, int *out_nid)
 +{
 +      struct node_active_region *r = NULL;
 +
 +      while (++*idx < nr_nodemap_entries) {
 +              if (nid == MAX_NUMNODES || nid == early_node_map[*idx].nid) {
 +                      r = &early_node_map[*idx];
 +                      break;
 +              }
 +      }
 +      if (!r) {
 +              *idx = -1;
 +              return;
 +      }
 +
 +      if (out_start_pfn)
 +              *out_start_pfn = r->start_pfn;
 +      if (out_end_pfn)
 +              *out_end_pfn = r->end_pfn;
 +      if (out_nid)
 +              *out_nid = r->nid;
 +}
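
__next_mem_pfn_range() implements a simple cursor protocol: the caller seeds *idx with -1, each call advances to the next early_node_map[] entry matching nid, and *idx drops back to -1 when the walk is over. The for_each_mem_pfn_range() loop macro used by the converted call sites is added elsewhere in this series and presumably is just a for loop wrapping this helper. A standalone userspace mock of the same protocol, with invented names and data, to show the calling convention:

    #include <stdio.h>

    #define MAX_NUMNODES 4          /* wildcard nid, as in the kernel code */

    struct region { unsigned long start_pfn, end_pfn; int nid; };

    /* Stand-in for early_node_map[]. */
    static struct region map[] = {
            { 0,   100, 0 },
            { 100, 256, 1 },
            { 256, 300, 0 },
    };
    static const int nr_entries = sizeof(map) / sizeof(map[0]);

    /* Same contract as __next_mem_pfn_range(): advance *idx, fill the outputs,
     * or reset *idx to -1 when no further entry matches. */
    static void next_pfn_range(int *idx, int nid, unsigned long *start,
                               unsigned long *end, int *out_nid)
    {
            while (++*idx < nr_entries) {
                    if (nid == MAX_NUMNODES || nid == map[*idx].nid) {
                            if (start)
                                    *start = map[*idx].start_pfn;
                            if (end)
                                    *end = map[*idx].end_pfn;
                            if (out_nid)
                                    *out_nid = map[*idx].nid;
                            return;
                    }
            }
            *idx = -1;              /* walk finished */
    }

    /* Assumed shape of the loop macro driving the helper. */
    #define for_each_pfn_range(i, nid, s, e, n)                     \
            for (i = -1, next_pfn_range(&i, nid, s, e, n);          \
                 i >= 0;                                            \
                 next_pfn_range(&i, nid, s, e, n))

    int main(void)
    {
            unsigned long s, e;
            int i, nid;

            for_each_pfn_range(i, MAX_NUMNODES, &s, &e, &nid)
                    printf("nid %d: %lu -> %lu\n", nid, s, e);
            return 0;
    }

The #ifndef CONFIG_HAVE_MEMBLOCK_NODE_MAP guard above suggests a memblock-backed variant supplies the same iterator when that option is set, which is why the converted call sites never touch early_node_map[] directly any more.
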
 +
  /**
   * add_active_range - Register a range of PFNs backed by physical memory
   * @nid: The node ID the range resides on
@@@ -4377,7 -4519,6 +4398,7 @@@ void __init add_active_range(unsigned i
  void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
                                unsigned long end_pfn)
  {
 +      unsigned long this_start_pfn, this_end_pfn;
        int i, j;
        int removed = 0;
  
                          nid, start_pfn, end_pfn);
  
        /* Find the old active region end and shrink */
 -      for_each_active_range_index_in_nid(i, nid) {
 -              if (early_node_map[i].start_pfn >= start_pfn &&
 -                  early_node_map[i].end_pfn <= end_pfn) {
 +      for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
 +              if (this_start_pfn >= start_pfn && this_end_pfn <= end_pfn) {
                        /* clear it */
                        early_node_map[i].start_pfn = 0;
                        early_node_map[i].end_pfn = 0;
                        removed = 1;
                        continue;
                }
 -              if (early_node_map[i].start_pfn < start_pfn &&
 -                  early_node_map[i].end_pfn > start_pfn) {
 -                      unsigned long temp_end_pfn = early_node_map[i].end_pfn;
 +              if (this_start_pfn < start_pfn && this_end_pfn > start_pfn) {
                        early_node_map[i].end_pfn = start_pfn;
 -                      if (temp_end_pfn > end_pfn)
 -                              add_active_range(nid, end_pfn, temp_end_pfn);
 +                      if (this_end_pfn > end_pfn)
 +                              add_active_range(nid, end_pfn, this_end_pfn);
                        continue;
                }
 -              if (early_node_map[i].start_pfn >= start_pfn &&
 -                  early_node_map[i].end_pfn > end_pfn &&
 -                  early_node_map[i].start_pfn < end_pfn) {
 +              if (this_start_pfn >= start_pfn && this_end_pfn > end_pfn &&
 +                  this_start_pfn < end_pfn) {
                        early_node_map[i].start_pfn = end_pfn;
                        continue;
                }
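
When punching [start_pfn, end_pfn) out of the map, the loop above has to handle three overlap cases: a range wholly inside the window is cleared, a range straddling the window start is trimmed (and re-added beyond the window if it also covers the end), and a range straddling only the end has its start bumped. A small standalone sketch of that case analysis on a single range (my own simplification, ignoring the in-place early_node_map[] bookkeeping):

    #include <stdio.h>

    struct region { unsigned long start, end; };      /* [start, end) */

    /* Remove [rs, re) from *r, possibly producing a second region in *tail.
     * Returns how many regions survive (0, 1 or 2). Illustration only. */
    static int remove_window(struct region *r, unsigned long rs, unsigned long re,
                             struct region *tail)
    {
            if (r->start >= rs && r->end <= re)
                    return 0;                         /* fully covered: drop it */
            if (r->start < rs && r->end > rs) {       /* straddles the start */
                    unsigned long old_end = r->end;

                    r->end = rs;
                    if (old_end > re) {               /* ...and the end: split */
                            tail->start = re;
                            tail->end = old_end;
                            return 2;
                    }
                    return 1;
            }
            if (r->start >= rs && r->start < re && r->end > re) {
                    r->start = re;                    /* straddles only the end */
                    return 1;
            }
            return 1;                                 /* no overlap: unchanged */
    }

    int main(void)
    {
            struct region r = { 100, 400 }, tail;
            int n = remove_window(&r, 200, 300, &tail);   /* hole in the middle */

            printf("%d region(s): [%lu,%lu)", n, r.start, r.end);
            if (n == 2)
                    printf(" [%lu,%lu)", tail.start, tail.end);
            printf("\n");             /* prints: 2 region(s): [100,200) [300,400) */
            return 0;
    }
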
@@@ -4460,11 -4605,6 +4481,11 @@@ void __init sort_node_map(void
                        sizeof(struct node_active_region),
                        cmp_node_active_region, NULL);
  }
 +#else /* !CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 +static inline void sort_node_map(void)
 +{
 +}
 +#endif
  
  /**
   * node_map_pfn_alignment - determine the maximum internode alignment
  unsigned long __init node_map_pfn_alignment(void)
  {
        unsigned long accl_mask = 0, last_end = 0;
 +      unsigned long start, end, mask;
        int last_nid = -1;
 -      int i;
 -
 -      for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
 -              int nid = early_node_map[i].nid;
 -              unsigned long start = early_node_map[i].start_pfn;
 -              unsigned long end = early_node_map[i].end_pfn;
 -              unsigned long mask;
 +      int i, nid;
  
 +      for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
                if (!start || last_nid < 0 || last_nid == nid) {
                        last_nid = nid;
                        last_end = end;
  /* Find the lowest pfn for a node */
  static unsigned long __init find_min_pfn_for_node(int nid)
  {
 -      int i;
        unsigned long min_pfn = ULONG_MAX;
 +      unsigned long start_pfn;
 +      int i;
  
 -      /* Assuming a sorted map, the first range found has the starting pfn */
 -      for_each_active_range_index_in_nid(i, nid)
 -              min_pfn = min(min_pfn, early_node_map[i].start_pfn);
 +      for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
 +              min_pfn = min(min_pfn, start_pfn);
  
        if (min_pfn == ULONG_MAX) {
                printk(KERN_WARNING
@@@ -4553,16 -4697,15 +4574,16 @@@ unsigned long __init find_min_pfn_with_
   */
  static unsigned long __init early_calculate_totalpages(void)
  {
 -      int i;
        unsigned long totalpages = 0;
 +      unsigned long start_pfn, end_pfn;
 +      int i, nid;
 +
 +      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
 +              unsigned long pages = end_pfn - start_pfn;
  
 -      for (i = 0; i < nr_nodemap_entries; i++) {
 -              unsigned long pages = early_node_map[i].end_pfn -
 -                                              early_node_map[i].start_pfn;
                totalpages += pages;
                if (pages)
 -                      node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
 +                      node_set_state(nid, N_HIGH_MEMORY);
        }
        return totalpages;
  }
@@@ -4617,8 -4760,6 +4638,8 @@@ restart
        /* Spread kernelcore memory as evenly as possible throughout nodes */
        kernelcore_node = required_kernelcore / usable_nodes;
        for_each_node_state(nid, N_HIGH_MEMORY) {
 +              unsigned long start_pfn, end_pfn;
 +
                /*
                 * Recalculate kernelcore_node if the division per node
                 * now exceeds what is necessary to satisfy the requested
                kernelcore_remaining = kernelcore_node;
  
                /* Go through each range of PFNs within this node */
 -              for_each_active_range_index_in_nid(i, nid) {
 -                      unsigned long start_pfn, end_pfn;
 +              for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
                        unsigned long size_pages;
  
 -                      start_pfn = max(early_node_map[i].start_pfn,
 -                                              zone_movable_pfn[nid]);
 -                      end_pfn = early_node_map[i].end_pfn;
 +                      start_pfn = max(start_pfn, zone_movable_pfn[nid]);
                        if (start_pfn >= end_pfn)
                                continue;
  
@@@ -4740,8 -4884,8 +4761,8 @@@ static void check_for_regular_memory(pg
   */
  void __init free_area_init_nodes(unsigned long *max_zone_pfn)
  {
 -      unsigned long nid;
 -      int i;
 +      unsigned long start_pfn, end_pfn;
 +      int i, nid;
  
        /* Sort early_node_map as initialisation assumes it is sorted */
        sort_node_map();
        }
  
        /* Print out the early_node_map[] */
 -      printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
 -      for (i = 0; i < nr_nodemap_entries; i++)
 -              printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
 -                                              early_node_map[i].start_pfn,
 -                                              early_node_map[i].end_pfn);
 +      printk("Early memory PFN ranges\n");
 +      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
 +              printk("  %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn);
  
        /* Initialise every node */
        mminit_verify_pageflags_layout();