* Used by page_zone() to look up the address of the struct zone whose
* id is encoded in the upper bits of page->flags
*/
-struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+struct zone *zone_table[1 << ZONETABLE_SHIFT];
EXPORT_SYMBOL(zone_table);
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
printk(KERN_EMERG "Backtrace:\n");
dump_stack();
printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
- page->flags &= ~(1 << PG_private |
+ page->flags &= ~(1 << PG_lru |
+ 1 << PG_private |
1 << PG_locked |
- 1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
+ 1 << PG_reclaim |
+ 1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback);
set_page_count(page, 0);
*/
static void prep_new_page(struct page *page, int order)
{
- if (page->mapping || page_mapcount(page) ||
- (page->flags & (
+ if ( page_mapcount(page) ||
+ page->mapping != NULL ||
+ page_count(page) != 0 ||
+ (page->flags & (
+ 1 << PG_lru |
1 << PG_private |
1 << PG_locked |
- 1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
1 << PG_reclaim |
+ 1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback )))
bad_page(__FUNCTION__, page);
return allocated;
}
+#ifdef CONFIG_NUMA
+/* Called from the slab reaper to drain remote pagesets */
+void drain_remote_pages(void)
+{
+ struct zone *zone;
+ int i;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ for_each_zone(zone) {
+ struct per_cpu_pageset *pset;
+
+ /* Do not drain local pagesets */
+ if (zone->zone_pgdat->node_id == numa_node_id())
+ continue;
+
+ pset = zone->pageset[smp_processor_id()];
+ for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &pset->pcp[i];
+ if (pcp->count)
+ pcp->count -= free_pages_bulk(zone, pcp->count,
+ &pcp->list, 0);
+ }
+ }
+ local_irq_restore(flags);
+}
+#endif
+
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
for_each_zone(zone) {
struct per_cpu_pageset *pset;
- pset = &zone->pageset[cpu];
+ pset = zone_pcp(zone, cpu);
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
local_irq_save(flags);
cpu = smp_processor_id();
- p = &z->pageset[cpu];
+ p = zone_pcp(z,cpu);
if (pg == orig) {
- z->pageset[cpu].numa_hit++;
+ p->numa_hit++;
} else {
p->numa_miss++;
- zonelist->zones[0]->pageset[cpu].numa_foreign++;
+ zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
}
if (pg == NODE_DATA(numa_node_id()))
p->local_node++;
if (PageAnon(page))
page->mapping = NULL;
free_pages_check(__FUNCTION__, page);
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+ pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
- if (pcp->count >= pcp->high)
- pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
list_add(&page->lru, &pcp->list);
pcp->count++;
+ if (pcp->count >= pcp->high)
+ pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
local_irq_restore(flags);
put_cpu();
}
if (order == 0) {
struct per_cpu_pages *pcp;
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+ pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- did_some_progress = try_to_free_pages(zones, gfp_mask, order);
+ did_some_progress = try_to_free_pages(zones, gfp_mask);
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
cond_resched();
if (likely(did_some_progress)) {
- /*
- * Go through the zonelist yet one more time, keep
- * very high watermark here, this is only to catch
- * a parallel oom killing, we must fail if we're still
- * under heavy pressure.
- */
for (i = 0; (z = zones[i]) != NULL; i++) {
if (!zone_watermark_ok(z, order, z->pages_min,
classzone_idx, can_try_harder,
goto got_pg;
}
- out_of_memory(gfp_mask);
+ out_of_memory(gfp_mask, order);
goto restart;
}
" order:%d, mode:0x%x\n",
p->comm, order, gfp_mask);
dump_stack();
+ show_mem();
}
return NULL;
got_pg:
__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
}
-unsigned long __read_page_state(unsigned offset)
+unsigned long __read_page_state(unsigned long offset)
{
unsigned long ret = 0;
int cpu;
return ret;
}
-void __mod_page_state(unsigned offset, unsigned long delta)
+void __mod_page_state(unsigned long offset, unsigned long delta)
{
unsigned long flags;
void* ptr;
if (!cpu_possible(cpu))
continue;
- pageset = zone->pageset + cpu;
+ pageset = zone_pcp(zone, cpu);
for (temperature = 0; temperature < 2; temperature++)
- printk("cpu %d %s: low %d, high %d, batch %d\n",
+ printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
cpu,
temperature ? "cold" : "hot",
pageset->pcp[temperature].low,
pageset->pcp[temperature].high,
- pageset->pcp[temperature].batch);
+ pageset->pcp[temperature].batch,
+ pageset->pcp[temperature].count);
}
}
get_page_state(&ps);
get_zone_counts(&active, &inactive, &free);
- printk("\nFree pages: %11ukB (%ukB HighMem)\n",
+ printk("Free pages: %11ukB (%ukB HighMem)\n",
K(nr_free_pages()),
K(nr_free_highpages()));
void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn)
{
- struct page *start = pfn_to_page(start_pfn);
struct page *page;
+ unsigned long end_pfn = start_pfn + size;
+ unsigned long pfn;
- for (page = start; page < (start + size); page++) {
- set_page_zone(page, NODEZONE(nid, zone));
+ for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ page = pfn_to_page(pfn);
+ set_page_links(page, zone, nid, pfn);
set_page_count(page, 0);
reset_page_mapcount(page);
SetPageReserved(page);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if (!is_highmem_idx(zone))
- set_page_address(page, __va(start_pfn << PAGE_SHIFT));
+ set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
- start_pfn++;
}
}
}
}
+#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+ unsigned long size)
+{
+ unsigned long snum = pfn_to_section_nr(pfn);
+ unsigned long end = pfn_to_section_nr(pfn + size);
+
+ if (FLAGS_HAS_NODE)
+ zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+ else
+ for (; snum <= end; snum++)
+ zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif
+static int __devinit zone_batchsize(struct zone *zone)
+{
+ int batch;
+
+ /*
+ * The per-cpu-pages pools are set to around 1000th of the
+ * size of the zone. But no more than 1/4 of a meg - there's
+ * no point in going beyond the size of L2 cache.
+ *
+ * OK, so we don't know how big the cache is. So guess.
+ */
+ batch = zone->present_pages / 1024;
+ if (batch * PAGE_SIZE > 256 * 1024)
+ batch = (256 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
+
+ /*
+ * Clamp the batch to a 2^n - 1 value. Having a power
+ * of 2 value was found to be more likely to have
+ * suboptimal cache aliasing properties in some cases.
+ *
+ * For example if 2 tasks are alternately allocating
+ * batches of pages, one task can end up with a lot
+ * of pages of one half of the possible page colors
+ * and the other with pages of the other colors.
+ */
+ batch = (1 << fls(batch + batch/2)) - 1;
+ return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+ struct per_cpu_pages *pcp;
+
+ pcp = &p->pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 2 * batch;
+ pcp->high = 6 * batch;
+ pcp->batch = max(1UL, 1 * batch);
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &p->pcp[1]; /* cold*/
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = max(1UL, 1 * batch);
+ INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset
+ boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+ struct zone *zone, *dzone;
+
+ for_each_zone(zone) {
+
+ zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!zone->pageset[cpu])
+ goto bad;
+
+ setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+ }
+
+ return 0;
+bad:
+ for_each_zone(dzone) {
+ if (dzone == zone)
+ break;
+ kfree(dzone->pageset[cpu]);
+ dzone->pageset[cpu] = NULL;
+ }
+ return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+ struct zone *zone;
+
+ for_each_zone(zone) {
+ struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+ zone_pcp(zone, cpu) = NULL;
+ kfree(pset);
+ }
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+ int ret = NOTIFY_OK;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (process_zones(cpu))
+ ret = NOTIFY_BAD;
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ free_zone_pagesets(cpu);
+ break;
+#endif
+ default:
+ break;
+ }
+ return ret;
+}
+
+static struct notifier_block pageset_notifier =
+ { &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset()
+{
+ int err;
+
+ /* Initialize per_cpu_pageset for cpu 0.
+ * A cpuup callback will do this for every cpu
+ * as it comes online
+ */
+ err = process_zones(smp_processor_id());
+ BUG_ON(err);
+ register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
/*
* Set up the zone data structures:
* - mark all pages reserved
unsigned long size, realsize;
unsigned long batch;
- zone_table[NODEZONE(nid, j)] = zone;
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
- /*
- * The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/4 of a meg - there's
- * no point in going beyond the size of L2 cache.
- *
- * OK, so we don't know how big the cache is. So guess.
- */
- batch = zone->present_pages / 1024;
- if (batch * PAGE_SIZE > 256 * 1024)
- batch = (256 * 1024) / PAGE_SIZE;
- batch /= 4; /* We effectively *= 4 below */
- if (batch < 1)
- batch = 1;
-
- /*
- * Clamp the batch to a 2^n - 1 value. Having a power
- * of 2 value was found to be more likely to have
- * suboptimal cache aliasing properties in some cases.
- *
- * For example if 2 tasks are alternately allocating
- * batches of pages, one task can end up with a lot
- * of pages of one half of the possible page colors
- * and the other with pages of the other colors.
- */
- batch = (1 << fls(batch + batch/2)) - 1;
+ batch = zone_batchsize(zone);
for (cpu = 0; cpu < NR_CPUS; cpu++) {
- struct per_cpu_pages *pcp;
-
- pcp = &zone->pageset[cpu].pcp[0]; /* hot */
- pcp->count = 0;
- pcp->low = 2 * batch;
- pcp->high = 6 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
-
- pcp = &zone->pageset[cpu].pcp[1]; /* cold */
- pcp->count = 0;
- pcp->low = 0;
- pcp->high = 2 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
+#ifdef CONFIG_NUMA
+ /* Early boot. Slab allocator not functional yet */
+ zone->pageset[cpu] = &boot_pageset[cpu];
+ setup_pageset(&boot_pageset[cpu],0);
+#else
+ setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
}
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
zone_names[j], realsize, batch);
memmap_init(size, nid, j, zone_start_pfn);
+ zonetable_add(zone, nid, j, zone_start_pfn, size);
+
zone_start_pfn += size;
zone_init_free_lists(pgdat, zone, zone->spanned_pages);
static void __init alloc_node_mem_map(struct pglist_data *pgdat)
{
- unsigned long size;
-
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
return;
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
/* ia64 gets its own node_mem_map, before this, without bootmem */
if (!pgdat->node_mem_map) {
+ unsigned long size;
+ struct page *map;
+
size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
- pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+ map = alloc_remap(pgdat->node_id, size);
+ if (!map)
+ map = alloc_bootmem_node(pgdat, size);
+ pgdat->node_mem_map = map;
}
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
if (pgdat == NODE_DATA(0))
mem_map = NODE_DATA(0)->node_mem_map;
#endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
void __init free_area_init_node(int nid, struct pglist_data *pgdat,
free_area_init_core(pgdat, zones_size, zholes_size);
}
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
static bootmem_data_t contig_bootmem_data;
struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
EXPORT_SYMBOL(contig_page_data);
+#endif
void __init free_area_init(unsigned long *zones_size)
{
- free_area_init_node(0, &contig_page_data, zones_size,
+ free_area_init_node(0, NODE_DATA(0), zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
-#endif
#ifdef CONFIG_PROC_FS
struct per_cpu_pageset *pageset;
int j;
- pageset = &zone->pageset[i];
+ pageset = zone_pcp(zone, i);
for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
if (pageset->pcp[j].count)
break;