/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct pglist_data *node_data[MAX_NUMNODES];
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

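/*
 * Physical-address-to-node lookup table: an address belongs to node
 * memnodemap[addr >> memnode_shift].  compute_hash_shift() below picks
 * the shift and fills in the map.
 */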
int memnode_shift;
u8 memnodemap[NODEMAPSIZE];

unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
cpumask_t node_to_cpumask[MAX_NUMNODES];

int numa_off __initdata;

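/*
 * Find the smallest shift (starting at 24 bits) for which every
 * (1UL << shift) sized chunk of the given memory ranges lies entirely
 * within one node, populating memnodemap[] along the way.  Returns the
 * shift on success, or -1 if no shift below 48 bits works.
 */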
int __init compute_hash_shift(struct node *nodes, int numnodes)
{
        int i;
        int shift = 24;
        u64 addr;

        /* When in doubt use brute force. */
        while (shift < 48) {
                memset(memnodemap, 0xff, sizeof(*memnodemap) * NODEMAPSIZE);
                for (i = 0; i < numnodes; i++) {
                        if (nodes[i].start == nodes[i].end)
                                continue;
                        for (addr = nodes[i].start;
                             addr < nodes[i].end;
                             addr += (1UL << shift)) {
                                if (memnodemap[addr >> shift] != 0xff &&
                                    memnodemap[addr >> shift] != i) {
                                        printk(KERN_INFO
                                               "node %d shift %d addr %Lx conflict %d\n",
                                               i, shift, addr, memnodemap[addr >> shift]);
                                        goto next;
                                }
                                memnodemap[addr >> shift] = i;
                        }
                }
                return shift;
        next:
                shift++;
        }
        memset(memnodemap, 0, sizeof(*memnodemap) * NODEMAPSIZE);
        return -1;
}

/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
        unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
        unsigned long nodedata_phys;
        const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

        start = round_up(start, ZONE_ALIGN);

        printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

        start_pfn = start >> PAGE_SHIFT;
        end_pfn = end >> PAGE_SHIFT;

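        /* Carve the node's pg_data_t out of the node's own memory via the
           e820 map, then look for space for the bootmem bitmap behind it. */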
        nodedata_phys = find_e820_area(start, end, pgdat_size);
        if (nodedata_phys == -1L)
                panic("Cannot find memory pgdat in node %d\n", nodeid);

        Dprintk("nodedata_phys %lx\n", nodedata_phys);

        node_data[nodeid] = phys_to_virt(nodedata_phys);
        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
        NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
        NODE_DATA(nodeid)->node_start_pfn = start_pfn;
        NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

        /* Find a place for the bootmem map */
        bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
        bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
        bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages << PAGE_SHIFT);
        if (bootmap_start == -1L)
                panic("Not enough continuous space for bootmap on node %d", nodeid);
        Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                         bootmap_start >> PAGE_SHIFT,
                                         start_pfn, end_pfn);

        e820_bootmem_free(NODE_DATA(nodeid), start, end);

        reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
        reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages << PAGE_SHIFT);
        node_set_online(nodeid);
}

/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
        unsigned long start_pfn, end_pfn;
        unsigned long zones[MAX_NR_ZONES];
        unsigned long dma_end_pfn;

        memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);

        start_pfn = node_start_pfn(nodeid);
        end_pfn = node_end_pfn(nodeid);

        Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);

        /* All nodes > 0 have a zero length zone DMA */
        dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
        if (start_pfn < dma_end_pfn) {
                zones[ZONE_DMA] = dma_end_pfn - start_pfn;
                zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
        } else {
                zones[ZONE_NORMAL] = end_pfn - start_pfn;
        }

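        /* zones[] now holds each zone's size in pages; let the core VM
           build this node's zone structures and memory map. */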
        free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
                            start_pfn, NULL);
}

void __init numa_init_array(void)
{
        int rr, i;
        /*
         * There are unfortunately some poorly designed mainboards around
         * that only connect memory to a single CPU. This breaks the 1:1
         * cpu->node mapping. To avoid this fill in the mapping for all
         * possible CPUs, as the number of CPUs is not known yet. We round
         * robin the existing nodes.
         */
        rr = 0;
        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_to_node[i] != NUMA_NO_NODE)
                        continue;
                rr = next_node(rr, node_online_map);
                if (rr == MAX_NUMNODES)
                        rr = first_node(node_online_map);
                cpu_to_node[i] = rr;
                rr++;
        }

        set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
}

#ifdef CONFIG_NUMA_EMU
int numa_fake __initdata = 0;

/* NUMA emulation: fake numa_fake nodes of roughly equal size over the real memory range */
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
        struct node nodes[MAX_NUMNODES];
        unsigned long sz = ((end_pfn - start_pfn) << PAGE_SHIFT) / numa_fake;

        /* Kludge needed for the hash function: round the node size down to a power of two */
        if (hweight64(sz) > 1) {
                unsigned long x = 1;
                while ((x << 1) < sz)
                        x <<= 1;
                if (x < sz/2)
                        printk("Numa emulation unbalanced. Complain to maintainer\n");
                sz = x;
        }

        memset(&nodes, 0, sizeof(nodes));
        for (i = 0; i < numa_fake; i++) {
                nodes[i].start = (start_pfn << PAGE_SHIFT) + i*sz;
                if (i == numa_fake-1)
                        sz = (end_pfn << PAGE_SHIFT) - nodes[i].start;
                nodes[i].end = nodes[i].start + sz;
                if (i != numa_fake-1)
                        nodes[i].end--;
                printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
                       i,
                       nodes[i].start, nodes[i].end,
                       (nodes[i].end - nodes[i].start) >> 20);
                node_set_online(i);
        }
        memnode_shift = compute_hash_shift(nodes, numa_fake);
        if (memnode_shift < 0) {
                memnode_shift = 0;
                printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
                return -1;
        }
        for_each_online_node(i)
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        numa_init_array();
        return 0;
}
#endif

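/*
 * Discover the node layout at boot: NUMA emulation first (if requested),
 * then the ACPI SRAT, then the AMD K8 northbridge registers, and finally
 * fall back to a single dummy node spanning all of memory.
 */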
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;

#ifdef CONFIG_NUMA_EMU
        if (numa_fake && !numa_emulation(start_pfn, end_pfn))
                return;
#endif

#ifdef CONFIG_ACPI_NUMA
        if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
                                          end_pfn << PAGE_SHIFT))
                return;
#endif

#ifdef CONFIG_K8_NUMA
        if (!numa_off && !k8_scan_nodes(start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT))
                return;
#endif
        printk(KERN_INFO "%s\n",
               numa_off ? "NUMA turned off" : "No NUMA configuration found");

        printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
               start_pfn << PAGE_SHIFT,
               end_pfn << PAGE_SHIFT);
        /* setup dummy node covering all memory */
        memnode_shift = 63;
        memnodemap[0] = 0;
        nodes_clear(node_online_map);
        node_set_online(0);
        for (i = 0; i < NR_CPUS; i++)
                cpu_to_node[i] = 0;
        node_to_cpumask[0] = cpumask_of_cpu(0);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}

__init void numa_add_cpu(int cpu)
{
        /* BP is initialized elsewhere */
        if (cpu)
                set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

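/* Hand every online node's bootmem pages over to the page allocator and
   return the total number of pages released. */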
unsigned long __init numa_free_all_bootmem(void)
{
        int i;
        unsigned long pages = 0;

        for_each_online_node(i) {
                pages += free_all_bootmem_node(NODE_DATA(i));
        }
        return pages;
}

void __init paging_init(void)
{
        int i;

        for_each_online_node(i) {
                setup_node_zones(i);
        }
}

/*
 * Parse the "numa=" boot option: numa=off disables NUMA, numa=fake=<N>
 * enables emulation with N nodes, and numa=noacpi ignores the ACPI SRAT.
 */
__init int numa_setup(char *opt)
{
        if (!strncmp(opt, "off", 3))
                numa_off = 1;
#ifdef CONFIG_NUMA_EMU
        if (!strncmp(opt, "fake=", 5)) {
                numa_fake = simple_strtoul(opt+5, NULL, 0);
                if (numa_fake >= MAX_NUMNODES)
                        numa_fake = MAX_NUMNODES;
        }
#endif
#ifdef CONFIG_ACPI_NUMA
        if (!strncmp(opt, "noacpi", 6))
                acpi_numa = -1;
#endif
        return 1;
}

EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);