[PATCH] Avoid excessive sorting of early_node_map[]

[mirror_ubuntu-bionic-kernel.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index cd47e8f7bd5bfda18ad83e546a202da2781964c7..f26fdc94393e1af05166af704252010518e5d950 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -40,6 +40,7 @@
  #include <linux/sort.h>
  #include <linux/pfn.h>
  #include <linux/backing-dev.h>
+#include <linux/fault-inject.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
@@ -83,7 +84,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
  
  EXPORT_SYMBOL(totalram_pages);
  
-static char *zone_names[MAX_NR_ZONES] = {
+static char * const zone_names[MAX_NR_ZONES] = {
          "DMA",
  #ifdef CONFIG_ZONE_DMA32
          "DMA32",
@@ -230,7 +231,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
         int i;
         int nr_pages = 1 << order;
  
-       page[1].lru.next = (void *)free_compound_page;  /* set dtor */
+       set_compound_page_dtor(page, free_compound_page);
         page[1].lru.prev = (void *)order;
         for (i = 0; i < nr_pages; i++) {
                 struct page *p = page + i;
@@ -685,9 +686,15 @@ void drain_node_pages(int nodeid)
  
                         pcp = &pset->pcp[i];
                         if (pcp->count) {
+                               int to_drain;
+
                                 local_irq_save(flags);
-                               free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-                               pcp->count = 0;
+                               if (pcp->count >= pcp->batch)
+                                       to_drain = pcp->batch;
+                               else
+                                       to_drain = pcp->count;
+                               free_pages_bulk(zone, to_drain, &pcp->list, 0);
+                               pcp->count -= to_drain;
                                 local_irq_restore(flags);
                         }
                 }
@@ -695,7 +702,6 @@ void drain_node_pages(int nodeid)
  }
  #endif
  
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
  static void __drain_pages(unsigned int cpu)
  {
         unsigned long flags;
@@ -705,6 +711,9 @@ static void __drain_pages(unsigned int cpu)
         for_each_zone(zone) {
                 struct per_cpu_pageset *pset;
  
+               if (!populated_zone(zone))
+                       continue;
+
                 pset = zone_pcp(zone, cpu);
                 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
                         struct per_cpu_pages *pcp;
@@ -717,7 +726,6 @@ static void __drain_pages(unsigned int cpu)
                 }
         }
  }
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
  
  #ifdef CONFIG_PM
  
@@ -888,6 +896,91 @@ failed:
  #define ALLOC_HIGH             0x20 /* __GFP_HIGH set */
  #define ALLOC_CPUSET           0x40 /* check for correct cpuset */
  
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+
+static struct fail_page_alloc_attr {
+       struct fault_attr attr;
+
+       u32 ignore_gfp_highmem; /* nonzero: never fail __GFP_HIGHMEM requests */
+       u32 ignore_gfp_wait;    /* nonzero: never fail __GFP_WAIT requests */
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+       struct dentry *ignore_gfp_highmem_file; /* "ignore-gfp-highmem" */
+       struct dentry *ignore_gfp_wait_file;    /* "ignore-gfp-wait" */
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+} fail_page_alloc = {
+       .attr = FAULT_ATTR_INITIALIZER,
+       .ignore_gfp_wait = 1,           /* both filters start enabled */
+       .ignore_gfp_highmem = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+       return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc); /* boot option handler */
+
+static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+       if (gfp_mask & __GFP_NOFAIL)
+               return 0;       /* __GFP_NOFAIL: never inject a failure */
+       if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+               return 0;       /* highmem request and that filter is set */
+       if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+               return 0;       /* __GFP_WAIT request and that filter is set */
+       /* Otherwise defer to should_fail(), passing the page count. */
+       return should_fail(&fail_page_alloc.attr, 1 << order);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+       mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; /* regular file, owner rw */
+       struct dentry *dir;
+       int err;
+
+       err = init_fault_attr_dentries(&fail_page_alloc.attr,
+                                      "fail_page_alloc");
+       if (err)
+               return err;
+       dir = fail_page_alloc.attr.dentries.dir;
+       /* One bool file per filter flag, both placed under dir. */
+       fail_page_alloc.ignore_gfp_wait_file =
+               debugfs_create_bool("ignore-gfp-wait", mode, dir,
+                                     &fail_page_alloc.ignore_gfp_wait);
+
+       fail_page_alloc.ignore_gfp_highmem_file =
+               debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+                                     &fail_page_alloc.ignore_gfp_highmem);
+       /* If either bool file is missing, remove both and the attr dentries. */
+       if (!fail_page_alloc.ignore_gfp_wait_file ||
+                       !fail_page_alloc.ignore_gfp_highmem_file) {
+               err = -ENOMEM;
+               debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
+               debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+               cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+       }
+
+       return err;     /* 0 on success, -ENOMEM if a bool file was missing */
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAIL_PAGE_ALLOC */
+
+static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+       return 0;       /* !CONFIG_FAIL_PAGE_ALLOC: never inject a failure */
+}
+
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
+
  /*
   * Return 1 if free pages are above 'mark'. This takes into account the order
   * of the allocation.
@@ -896,8 +989,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                       int classzone_idx, int alloc_flags)
  {
         /* free_pages my go negative - that's OK */
-       unsigned long min = mark;
-       long free_pages = z->free_pages - (1 << order) + 1;
+       long min = mark, free_pages = z->free_pages - (1 << order) + 1;
         int o;
  
         if (alloc_flags & ALLOC_HIGH)
@@ -1072,7 +1164,7 @@ zonelist_scan:
                         zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
                                 break;
                 if ((alloc_flags & ALLOC_CPUSET) &&
-                       !cpuset_zone_allowed(zone, gfp_mask))
+                       !cpuset_zone_allowed_softwall(zone, gfp_mask))
                                 goto try_next_zone;
  
                 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1132,6 +1224,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
  
         might_sleep_if(wait);
  
+       if (should_fail_alloc_page(gfp_mask, order))
+               return NULL;
+
  restart:
         z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
  
@@ -1145,6 +1240,17 @@ restart:
         if (page)
                 goto got_pg;
  
+       /*
+        * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+        * __GFP_NOWARN set) should not cause reclaim since the subsystem
+        * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
+        * using a larger set of nodes after it has established that the
+        * allowed per node queues are empty and that nodes are
+        * over allocated.
+        */
+       if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+               goto nopage;
+
         for (z = zonelist->zones; *z; z++)
                 wakeup_kswapd(*z, order);
  
@@ -1180,6 +1286,7 @@ restart:
  
         /* This allocation should allow future memory freeing. */
  
+rebalance:
         if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
                         && !in_interrupt()) {
                 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
@@ -1201,7 +1308,6 @@ nofail_alloc:
         if (!wait)
                 goto nopage;
  
-rebalance:
         cond_resched();
  
         /* We now go into synchronous reclaim */
@@ -1401,7 +1507,7 @@ unsigned int nr_free_pagecache_pages(void)
  static inline void show_node(struct zone *zone)
  {
         if (NUMA_BUILD)
-               printk("Node %ld ", zone_to_nid(zone));
+               printk("Node %d ", zone_to_nid(zone));
  }
  
  void si_meminfo(struct sysinfo *val)
@@ -1473,8 +1579,8 @@ void show_free_areas(void)
  
         get_zone_counts(&active, &inactive, &free);
  
-       printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
-               "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+       printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+               " free:%u slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
                 active,
                 inactive,
                 global_page_state(NR_FILE_DIRTY),
@@ -1484,7 +1590,8 @@ void show_free_areas(void)
                 global_page_state(NR_SLAB_RECLAIMABLE) +
                         global_page_state(NR_SLAB_UNRECLAIMABLE),
                 global_page_state(NR_FILE_MAPPED),
-               global_page_state(NR_PAGETABLE));
+               global_page_state(NR_PAGETABLE),
+               global_page_state(NR_BOUNCE));
  
         for_each_zone(zone) {
                 int i;
@@ -1849,17 +1956,24 @@ static inline unsigned long wait_table_bits(unsigned long size)
   * done. Non-atomic initialization, single-pass.
   */
  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
-               unsigned long start_pfn)
+               unsigned long start_pfn, enum memmap_context context)
  {
         struct page *page;
         unsigned long end_pfn = start_pfn + size;
         unsigned long pfn;
  
         for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-               if (!early_pfn_valid(pfn))
-                       continue;
-               if (!early_pfn_in_nid(pfn, nid))
-                       continue;
+               /*
+                * There can be holes in boot-time mem_map[]s
+                * handed to this function.  They do not
+                * exist on hotplugged memory.
+                */
+               if (context == MEMMAP_EARLY) {
+                       if (!early_pfn_valid(pfn))
+                               continue;
+                       if (!early_pfn_in_nid(pfn, nid))
+                               continue;
+               }
                 page = pfn_to_page(pfn);
                 set_page_links(page, zone, nid, pfn);
                 init_page_count(page);
@@ -1886,7 +2000,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
  
  #ifndef __HAVE_ARCH_MEMMAP_INIT
  #define memmap_init(size, nid, zone, start_pfn) \
-       memmap_init_zone((size), (nid), (zone), (start_pfn))
+       memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
  #endif
  
  static int __cpuinit zone_batchsize(struct zone *zone)
@@ -2036,16 +2150,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
         int ret = NOTIFY_OK;
  
         switch (action) {
-               case CPU_UP_PREPARE:
-                       if (process_zones(cpu))
-                               ret = NOTIFY_BAD;
-                       break;
-               case CPU_UP_CANCELED:
-               case CPU_DEAD:
-                       free_zone_pagesets(cpu);
-                       break;
-               default:
-                       break;
+       case CPU_UP_PREPARE:
+               if (process_zones(cpu))
+                       ret = NOTIFY_BAD;
+               break;
+       case CPU_UP_CANCELED:
+       case CPU_DEAD:
+               free_zone_pagesets(cpu);
+               break;
+       default:
+               break;
         }
         return ret;
  }
@@ -2132,7 +2246,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
  
  __meminit int init_currently_empty_zone(struct zone *zone,
                                         unsigned long zone_start_pfn,
-                                       unsigned long size)
+                                       unsigned long size,
+                                       enum memmap_context context)
  {
         struct pglist_data *pgdat = zone->zone_pgdat;
         int ret;
@@ -2576,7 +2691,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
                 if (!size)
                         continue;
  
-               ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+               ret = init_currently_empty_zone(zone, zone_start_pfn,
+                                               size, MEMMAP_EARLY);
                 BUG_ON(ret);
                 zone_start_pfn += size;
         }
@@ -2761,20 +2877,23 @@ static void __init sort_node_map(void)
                         cmp_node_active_region, NULL);
  }
  
-/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
+/* Find the lowest pfn for a node */
  unsigned long __init find_min_pfn_for_node(unsigned long nid)
  {
         int i;
-
-       /* Regions in the early_node_map can be in any order */
-       sort_node_map();
+       unsigned long min_pfn = ULONG_MAX;
  
         /* Assuming a sorted map, the first range found has the starting pfn */
         for_each_active_range_index_in_nid(i, nid)
-               return early_node_map[i].start_pfn;
+               min_pfn = min(min_pfn, early_node_map[i].start_pfn);
  
-       printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
-       return 0;
+       if (min_pfn == ULONG_MAX) {
+               printk(KERN_WARNING
+                       "Could not find start_pfn for node %lu\n", nid);
+               return 0;
+       }
+
+       return min_pfn;
  }
  
  /**
@@ -2823,6 +2942,9 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
         unsigned long nid;
         enum zone_type i;
  
+       /* Sort early_node_map as initialisation assumes it is sorted */
+       sort_node_map();
+
         /* Record where the zone boundaries are */
         memset(arch_zone_lowest_possible_pfn, 0,
                                 sizeof(arch_zone_lowest_possible_pfn));
@@ -2890,7 +3012,6 @@ void __init free_area_init(unsigned long *zones_size)
                         __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
  }
  
-#ifdef CONFIG_HOTPLUG_CPU
  static int page_alloc_cpu_notify(struct notifier_block *self,
                                  unsigned long action, void *hcpu)
  {
@@ -2905,7 +3026,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
         }
         return NOTIFY_OK;
  }
-#endif /* CONFIG_HOTPLUG_CPU */
  
  void __init page_alloc_init(void)
  {
@@ -3209,7 +3329,7 @@ void *__init alloc_large_system_hash(const char *tablename,
         /* allow the kernel cmdline to have a say */
         if (!numentries) {
                 /* round applicable memory size up to nearest megabyte */
-               numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
+               numentries = nr_kernel_pages;
                 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
                 numentries >>= 20 - PAGE_SHIFT;
                 numentries <<= 20 - PAGE_SHIFT;
@@ -3219,6 +3339,10 @@ void *__init alloc_large_system_hash(const char *tablename,
                         numentries >>= (scale - PAGE_SHIFT);
                 else
                         numentries <<= (PAGE_SHIFT - scale);
+
+               /* Make sure we've got at least a 0-order allocation.. */
+               if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+                       numentries = PAGE_SIZE / bucketsize;
         }
         numentries = roundup_pow_of_two(numentries);
  
@@ -3231,7 +3355,7 @@ void *__init alloc_large_system_hash(const char *tablename,
         if (numentries > max)
                 numentries = max;
  
-       log2qty = long_log2(numentries);
+       log2qty = ilog2(numentries);
  
         do {
                 size = bucketsize << log2qty;
@@ -3253,7 +3377,7 @@ void *__init alloc_large_system_hash(const char *tablename,
         printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
                tablename,
                (1U << log2qty),
-              long_log2(size) - PAGE_SHIFT,
+              ilog2(size) - PAGE_SHIFT,
                size);
  
         if (_hash_shift)