[PATCH] jsm: use dynamic major number allocation

[mirror_ubuntu-artful-kernel.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 2019c1b19254e4b2085064c911b40b3f962a32bb..1d6ba6a4b594e7db82617c67449831482f52412d 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
   * Used by page_zone() to look up the address of the struct zone whose
   * id is encoded in the upper bits of page->flags
   */
-struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+struct zone *zone_table[1 << ZONETABLE_SHIFT];
  EXPORT_SYMBOL(zone_table);
  
  static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page)
         printk(KERN_EMERG "Backtrace:\n");
         dump_stack();
         printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
-       page->flags &= ~(1 << PG_private        |
+       page->flags &= ~(1 << PG_lru    |
+                       1 << PG_private |
                         1 << PG_locked  |
-                       1 << PG_lru     |
                         1 << PG_active  |
                         1 << PG_dirty   |
+                       1 << PG_reclaim |
+                       1 << PG_slab    |
                         1 << PG_swapcache |
                         1 << PG_writeback);
         set_page_count(page, 0);
@@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order)
   */
  static void prep_new_page(struct page *page, int order)
  {
-       if (page->mapping || page_mapcount(page) ||
-           (page->flags & (
+       if (    page_mapcount(page) ||
+               page->mapping != NULL ||
+               page_count(page) != 0 ||
+               (page->flags & (
+                       1 << PG_lru     |
                         1 << PG_private |
                         1 << PG_locked  |
-                       1 << PG_lru     |
                         1 << PG_active  |
                         1 << PG_dirty   |
                         1 << PG_reclaim |
+                       1 << PG_slab    |
                         1 << PG_swapcache |
                         1 << PG_writeback )))
                 bad_page(__FUNCTION__, page);
@@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
         return allocated;
  }
  
+#ifdef CONFIG_NUMA
+/* Drain this CPU's pagesets of zones on other nodes; called from the slab reaper */
+void drain_remote_pages(void)
+{
+       struct zone *zone;
+       int i;
+       unsigned long flags;
+
+       local_irq_save(flags);          /* pagesets are only manipulated with interrupts off */
+       for_each_zone(zone) {
+               struct per_cpu_pageset *pset;
+
+               /* Do not drain local pagesets */
+               if (zone->zone_pgdat->node_id == numa_node_id())
+                       continue;
+
+               pset = zone->pageset[smp_processor_id()];
+               for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {  /* every list of the pageset */
+                       struct per_cpu_pages *pcp;
+
+                       pcp = &pset->pcp[i];
+                       if (pcp->count)         /* ask for the whole list; presumably returns pages freed */
+                               pcp->count -= free_pages_bulk(zone, pcp->count,
+                                               &pcp->list, 0);
+               }
+       }
+       local_irq_restore(flags);
+}
+#endif
+
  #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
  static void __drain_pages(unsigned int cpu)
  {
@@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu)
         for_each_zone(zone) {
                 struct per_cpu_pageset *pset;
  
-               pset = &zone->pageset[cpu];
+               pset = zone_pcp(zone, cpu);
                 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
                         struct per_cpu_pages *pcp;
  
@@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
  
         local_irq_save(flags);
         cpu = smp_processor_id();
-       p = &z->pageset[cpu];
+       p = zone_pcp(z,cpu);
         if (pg == orig) {
-               z->pageset[cpu].numa_hit++;
+               p->numa_hit++;
         } else {
                 p->numa_miss++;
-               zonelist->zones[0]->pageset[cpu].numa_foreign++;
+               zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
         }
         if (pg == NODE_DATA(numa_node_id()))
                 p->local_node++;
@@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
         if (PageAnon(page))
                 page->mapping = NULL;
         free_pages_check(__FUNCTION__, page);
-       pcp = &zone->pageset[get_cpu()].pcp[cold];
+       pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
         local_irq_save(flags);
-       if (pcp->count >= pcp->high)
-               pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
         list_add(&page->lru, &pcp->list);
         pcp->count++;
+       if (pcp->count >= pcp->high)
+               pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
         local_irq_restore(flags);
         put_cpu();
  }
@@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
         if (order == 0) {
                 struct per_cpu_pages *pcp;
  
-               pcp = &zone->pageset[get_cpu()].pcp[cold];
+               pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
                 local_irq_save(flags);
                 if (pcp->count <= pcp->low)
                         pcp->count += rmqueue_bulk(zone, 0,
@@ -854,7 +889,7 @@ rebalance:
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
  
-       did_some_progress = try_to_free_pages(zones, gfp_mask, order);
+       did_some_progress = try_to_free_pages(zones, gfp_mask);
  
         p->reclaim_state = NULL;
         p->flags &= ~PF_MEMALLOC;
@@ -862,12 +897,6 @@ rebalance:
         cond_resched();
  
         if (likely(did_some_progress)) {
-               /*
-                * Go through the zonelist yet one more time, keep
-                * very high watermark here, this is only to catch
-                * a parallel oom killing, we must fail if we're still
-                * under heavy pressure.
-                */
                 for (i = 0; (z = zones[i]) != NULL; i++) {
                         if (!zone_watermark_ok(z, order, z->pages_min,
                                                classzone_idx, can_try_harder,
@@ -901,7 +930,7 @@ rebalance:
                                 goto got_pg;
                 }
  
-               out_of_memory(gfp_mask);
+               out_of_memory(gfp_mask, order);
                 goto restart;
         }
  
@@ -930,6 +959,7 @@ nopage:
                         " order:%d, mode:0x%x\n",
                         p->comm, order, gfp_mask);
                 dump_stack();
+               show_mem();
         }
         return NULL;
  got_pg:
@@ -1139,7 +1169,7 @@ void get_full_page_state(struct page_state *ret)
         __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
  }
  
-unsigned long __read_page_state(unsigned offset)
+unsigned long __read_page_state(unsigned long offset)
  {
         unsigned long ret = 0;
         int cpu;
@@ -1153,7 +1183,7 @@ unsigned long __read_page_state(unsigned offset)
         return ret;
  }
  
-void __mod_page_state(unsigned offset, unsigned long delta)
+void __mod_page_state(unsigned long offset, unsigned long delta)
  {
         unsigned long flags;
         void* ptr;
@@ -1262,22 +1292,23 @@ void show_free_areas(void)
                         if (!cpu_possible(cpu))
                                 continue;
  
-                       pageset = zone->pageset + cpu;
+                       pageset = zone_pcp(zone, cpu);
  
                         for (temperature = 0; temperature < 2; temperature++)
-                               printk("cpu %d %s: low %d, high %d, batch %d\n",
+                               printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
                                         cpu,
                                         temperature ? "cold" : "hot",
                                         pageset->pcp[temperature].low,
                                         pageset->pcp[temperature].high,
-                                       pageset->pcp[temperature].batch);
+                                       pageset->pcp[temperature].batch,
+                                       pageset->pcp[temperature].count);
                 }
         }
  
         get_page_state(&ps);
         get_zone_counts(&active, &inactive, &free);
  
-       printk("\nFree pages: %11ukB (%ukB HighMem)\n",
+       printk("Free pages: %11ukB (%ukB HighMem)\n",
                 K(nr_free_pages()),
                 K(nr_free_highpages()));
  
@@ -1612,11 +1643,17 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
  void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                 unsigned long start_pfn)
  {
-       struct page *start = pfn_to_page(start_pfn);
         struct page *page;
+       unsigned long end_pfn = start_pfn + size;
+       unsigned long pfn;
  
-       for (page = start; page < (start + size); page++) {
-               set_page_zone(page, NODEZONE(nid, zone));
+       for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+               if (!early_pfn_valid(pfn))
+                       continue;
+               if (!early_pfn_in_nid(pfn, nid))
+                       continue;
+               page = pfn_to_page(pfn);
+               set_page_links(page, zone, nid, pfn);
                 set_page_count(page, 0);
                 reset_page_mapcount(page);
                 SetPageReserved(page);
@@ -1624,9 +1661,8 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
  #ifdef WANT_PAGE_VIRTUAL
                 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
                 if (!is_highmem_idx(zone))
-                       set_page_address(page, __va(start_pfn << PAGE_SHIFT));
+                       set_page_address(page, __va(pfn << PAGE_SHIFT));
  #endif
-               start_pfn++;
         }
  }
  
@@ -1640,11 +1676,181 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
         }
  }
  
+#define ZONETABLE_INDEX(x, zone_nr)    (((x) << ZONES_SHIFT) | (zone_nr))      /* zone_table[] slot for node/section x */
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+               unsigned long size)
+{      /* Publish @zone in zone_table[], keyed by node id or by each section it covers. */
+       unsigned long snum = pfn_to_section_nr(pfn);
+       unsigned long end = pfn_to_section_nr(pfn + size);     /* NOTE(review): pfn + size is one past the range but the loop is inclusive - confirm */
+
+       if (FLAGS_HAS_NODE)                     /* one entry for the whole zone */
+               zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+       else                                    /* one entry per section from snum to end */
+               for (; snum <= end; snum++)
+                       zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
  #ifndef __HAVE_ARCH_MEMMAP_INIT
  #define memmap_init(size, nid, zone, start_pfn) \
         memmap_init_zone((size), (nid), (zone), (start_pfn))
  #endif
  
+static int __devinit zone_batchsize(struct zone *zone)
+{
+       int batch;      /* batch size derived from zone->present_pages, returned to the caller */
+
+       /*
+        * The per-cpu-pages pools are set to around 1000th of the
+        * size of the zone.  But no more than 1/4 of a meg - there's
+        * no point in going beyond the size of L2 cache.
+        *
+        * OK, so we don't know how big the cache is.  So guess.
+        */
+       batch = zone->present_pages / 1024;
+       if (batch * PAGE_SIZE > 256 * 1024)
+               batch = (256 * 1024) / PAGE_SIZE;
+       batch /= 4;             /* We effectively *= 4 below; NOTE(review): presumably the low/high multipliers applied by the caller - confirm */
+       if (batch < 1)
+               batch = 1;
+
+       /*
+        * Clamp the batch to a 2^n - 1 value. Having a power
+        * of 2 value was found to be more likely to have
+        * suboptimal cache aliasing properties in some cases.
+        *
+        * For example if 2 tasks are alternately allocating
+        * batches of pages, one task can end up with a lot
+        * of pages of one half of the possible page colors
+        * and the other with pages of the other colors.
+        */
+       batch = (1 << fls(batch + batch/2)) - 1;
+       return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{      /* Reset both lists of @p to empty and derive their limits from @batch. */
+       struct per_cpu_pages *pcp;
+
+       pcp = &p->pcp[0];               /* hot */
+       pcp->count = 0;
+       pcp->low = 2 * batch;
+       pcp->high = 6 * batch;
+       pcp->batch = max(1UL, 1 * batch);       /* never 0, even when batch == 0 */
+       INIT_LIST_HEAD(&pcp->list);
+
+       pcp = &p->pcp[1];               /* cold */
+       pcp->count = 0;
+       pcp->low = 0;
+       pcp->high = 2 * batch;
+       pcp->batch = max(1UL, 1 * batch);       /* never 0, even when batch == 0 */
+       INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table: one entry per cpu, shared by all zones and all
+ * nodes. The parameters are set in such a way (a batch of zero) that
+ * an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They also play a role in bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do not check whether the
+ * processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset
+       boot_pageset[NR_CPUS];
+
+/*
+ * Allocate and set up one per_cpu_pageset for @cpu in every zone, taken from
+ * @cpu's node. Returns 0, or -ENOMEM after freeing what had been allocated.
+ */
+static int __devinit process_zones(int cpu)
+{
+       struct zone *zone, *dzone;
+
+       for_each_zone(zone) {
+
+               zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+                                        GFP_KERNEL, cpu_to_node(cpu));
+               if (!zone->pageset[cpu])
+                       goto bad;
+
+               setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+       }
+
+       return 0;
+bad:
+       for_each_zone(dzone) {          /* undo only the zones handled before the failing one */
+               if (dzone == zone)
+                       break;
+               kfree(dzone->pageset[cpu]);
+               dzone->pageset[cpu] = NULL;     /* NOTE(review): left NULL, not pointed back at a boot pageset */
+       }
+       return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{      /* Free @cpu's pageset in every zone and clear the zone's pointer to it. */
+#ifdef CONFIG_NUMA     /* NOTE(review): redundant, this function already sits inside an #ifdef CONFIG_NUMA region */
+       struct zone *zone;
+
+       for_each_zone(zone) {
+               struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+               zone_pcp(zone, cpu) = NULL;     /* unhook the pointer before the memory goes away */
+               kfree(pset);
+       }
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+               unsigned long action,
+               void *hcpu)
+{      /* CPU notifier: allocate pagesets before a cpu comes up, free them once it is dead. */
+       int cpu = (long)hcpu;           /* the cpu number arrives packed in the pointer argument */
+       int ret = NOTIFY_OK;
+
+       switch (action) {
+               case CPU_UP_PREPARE:
+                       if (process_zones(cpu))         /* non-zero means allocation failed */
+                               ret = NOTIFY_BAD;
+                       break;
+#ifdef CONFIG_HOTPLUG_CPU
+               case CPU_DEAD:
+                       free_zone_pagesets(cpu);
+                       break;
+#endif
+               default:                /* every other action is ignored and reports NOTIFY_OK */
+                       break;
+       }
+       return ret;
+}
+
+static struct notifier_block pageset_notifier =
+       { &pageset_cpuup_callback, NULL, 0 };   /* positional initialiser; handed to register_cpu_notifier() */
+
+void __init setup_per_cpu_pageset(void)
+{
+       int err;
+
+       /* Initialize the per_cpu_pagesets of the cpu running this code.
+        * A cpuup callback will do this for every other cpu
+        * as it comes online; a failure here is fatal (BUG_ON).
+        */
+       err = process_zones(smp_processor_id());
+       BUG_ON(err);
+       register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
  /*
   * Set up the zone data structures:
   *   - mark all pages reserved
@@ -1668,7 +1874,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                 unsigned long size, realsize;
                 unsigned long batch;
  
-               zone_table[NODEZONE(nid, j)] = zone;
                 realsize = size = zones_size[j];
                 if (zholes_size)
                         realsize -= zholes_size[j];
@@ -1687,48 +1892,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
  
                 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
  
-               /*
-                * The per-cpu-pages pools are set to around 1000th of the
-                * size of the zone.  But no more than 1/4 of a meg - there's
-                * no point in going beyond the size of L2 cache.
-                *
-                * OK, so we don't know how big the cache is.  So guess.
-                */
-               batch = zone->present_pages / 1024;
-               if (batch * PAGE_SIZE > 256 * 1024)
-                       batch = (256 * 1024) / PAGE_SIZE;
-               batch /= 4;             /* We effectively *= 4 below */
-               if (batch < 1)
-                       batch = 1;
-
-               /*
-                * Clamp the batch to a 2^n - 1 value. Having a power
-                * of 2 value was found to be more likely to have
-                * suboptimal cache aliasing properties in some cases.
-                *
-                * For example if 2 tasks are alternately allocating
-                * batches of pages, one task can end up with a lot
-                * of pages of one half of the possible page colors
-                * and the other with pages of the other colors.
-                */
-               batch = (1 << fls(batch + batch/2)) - 1;
+               batch = zone_batchsize(zone);
  
                 for (cpu = 0; cpu < NR_CPUS; cpu++) {
-                       struct per_cpu_pages *pcp;
-
-                       pcp = &zone->pageset[cpu].pcp[0];       /* hot */
-                       pcp->count = 0;
-                       pcp->low = 2 * batch;
-                       pcp->high = 6 * batch;
-                       pcp->batch = 1 * batch;
-                       INIT_LIST_HEAD(&pcp->list);
-
-                       pcp = &zone->pageset[cpu].pcp[1];       /* cold */
-                       pcp->count = 0;
-                       pcp->low = 0;
-                       pcp->high = 2 * batch;
-                       pcp->batch = 1 * batch;
-                       INIT_LIST_HEAD(&pcp->list);
+#ifdef CONFIG_NUMA
+                       /* Early boot. Slab allocator not functional yet */
+                       zone->pageset[cpu] = &boot_pageset[cpu];
+                       setup_pageset(&boot_pageset[cpu],0);
+#else
+                       setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
                 }
                 printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
                                 zone_names[j], realsize, batch);
@@ -1766,6 +1939,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
  
                 memmap_init(size, nid, j, zone_start_pfn);
  
+               zonetable_add(zone, nid, j, zone_start_pfn, size);
+
                 zone_start_pfn += size;
  
                 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
@@ -1774,24 +1949,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
  
  static void __init alloc_node_mem_map(struct pglist_data *pgdat)
  {
-       unsigned long size;
-
         /* Skip empty nodes */
         if (!pgdat->node_spanned_pages)
                 return;
  
+#ifdef CONFIG_FLAT_NODE_MEM_MAP        /* the rest of the body is compiled only with this option */
         /* ia64 gets its own node_mem_map, before this, without bootmem */
         if (!pgdat->node_mem_map) {
+               unsigned long size;
+               struct page *map;
+
                 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-               pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+               map = alloc_remap(pgdat->node_id, size);        /* first choice */
+               if (!map)                                       /* alloc_remap() gave nothing */
+                       map = alloc_bootmem_node(pgdat, size);
+               pgdat->node_mem_map = map;
         }
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
         /*
          * With no DISCONTIG, the global mem_map is just set as node 0's
          */
         if (pgdat == NODE_DATA(0))
                 mem_map = NODE_DATA(0)->node_mem_map;
  #endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
  }
  
  void __init free_area_init_node(int nid, struct pglist_data *pgdat,
@@ -1807,18 +1988,18 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat,
         free_area_init_core(pgdat, zones_size, zholes_size);
  }
  
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
  static bootmem_data_t contig_bootmem_data;
  struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
  
  EXPORT_SYMBOL(contig_page_data);
+#endif
  
  void __init free_area_init(unsigned long *zones_size)
  {
-       free_area_init_node(0, &contig_page_data, zones_size,
+       free_area_init_node(0, NODE_DATA(0), zones_size,
                         __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
  }
-#endif
  
  #ifdef CONFIG_PROC_FS
  
@@ -1929,7 +2110,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
                         struct per_cpu_pageset *pageset;
                         int j;
  
-                       pageset = &zone->pageset[i];
+                       pageset = zone_pcp(zone, i);
                         for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
                                 if (pageset->pcp[j].count)
                                         break;