/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "numa: " fmt

#include <linux/threads.h>
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <linux/stop_machine.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
#include <asm/drmem.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
static int form1_affinity;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for_each_node(node)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
}

static int __init fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}
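
/*
 * Worked example (illustrative): booting with "numa=fake=1G,3G" sets the
 * boundaries at 1GB and 3GB. Regions whose end stays below 1GB keep the
 * caller's node id; once a region's end crosses 1GB a new fake node id is
 * handed out, and again when 3GB is crossed, so memory is carved roughly
 * into nodes covering [0,1G), [1G,3G) and [3G,...).
 */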

static void reset_numa_cpu_lookup_table(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		numa_cpu_lookup_table[cpu] = -1;
}

static void map_cpu_to_node(int cpu, int node)
{
	update_numa_cpu_lookup_table(cpu, node);

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */

/* must hold reference to node during call */
static const __be32 *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

int __node_distance(int a, int b)
{
	int i;
	int distance = LOCAL_DISTANCE;

	if (!form1_affinity)
		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);

	for (i = 0; i < distance_ref_points_depth; i++) {
		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
			break;

		/* Double the distance for each NUMA level */
		distance *= 2;
	}

	return distance;
}
EXPORT_SYMBOL(__node_distance);
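
/*
 * Worked example (hypothetical lookup-table contents): with form 1
 * affinity, distance_ref_points_depth == 4 and LOCAL_DISTANCE == 10,
 * two nodes whose most significant reference point matches are at
 * distance 10; each leading level that differs doubles the distance,
 * so nodes differing at all four levels are at 10 * 2^4 = 160.
 */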

static void initialize_distance_lookup_table(int nid,
		const __be32 *associativity)
{
	int i;

	if (!form1_affinity)
		return;

	for (i = 0; i < distance_ref_points_depth; i++) {
		const __be32 *entry;

		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
		distance_lookup_table[nid][i] = of_read_number(entry, 1);
	}
}

/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int associativity_to_nid(const __be32 *associativity)
{
	int nid = NUMA_NO_NODE;

	if (min_common_depth == -1)
		goto out;

	if (of_read_number(associativity, 1) >= min_common_depth)
		nid = of_read_number(&associativity[min_common_depth], 1);

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = NUMA_NO_NODE;

	if (nid > 0 &&
	    of_read_number(associativity, 1) >= distance_ref_points_depth) {
		/*
		 * Skip the length field and send start of associativity array
		 */
		initialize_distance_lookup_table(nid, associativity + 1);
	}

out:
	return nid;
}
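
/*
 * Worked example (hypothetical property): an ibm,associativity value of
 * <4 0 0 1 2> encodes a length of 4 followed by four domain ids. With
 * min_common_depth == 4, associativity_to_nid() reads the cell at index
 * 4 and returns nid 2.
 */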

/* Returns the nid associated with the given device tree node,
 * or -1 if not found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = NUMA_NO_NODE;
	const __be32 *tmp;

	tmp = of_get_associativity(device);
	if (tmp)
		nid = associativity_to_nid(tmp);
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	int nid = NUMA_NO_NODE;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		device = of_get_next_parent(device);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL(of_node_to_nid);

static int __init find_min_common_depth(void)
{
	int depth;
	struct device_node *root;

	if (firmware_has_feature(FW_FEATURE_OPAL))
		root = of_find_node_by_path("/ibm,opal");
	else
		root = of_find_node_by_path("/rtas");
	if (!root)
		root = of_find_node_by_path("/");

	/*
	 * This property is a set of 32-bit integers, each representing
	 * an index into the ibm,associativity nodes.
	 *
	 * With form 0 affinity the first integer is for an SMP configuration
	 * (should be all 0's) and the second is for a normal NUMA
	 * configuration. We have only one level of NUMA.
	 *
	 * With form 1 affinity the first integer is the most significant
	 * NUMA boundary and the following are progressively less significant
	 * boundaries. There can be more than one level of NUMA.
	 */
	distance_ref_points = of_get_property(root,
			"ibm,associativity-reference-points",
			&distance_ref_points_depth);

	if (!distance_ref_points) {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		goto err;
	}

	distance_ref_points_depth /= sizeof(int);

	if (firmware_has_feature(FW_FEATURE_OPAL) ||
	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
		dbg("Using form 1 affinity\n");
		form1_affinity = 1;
	}

	if (form1_affinity) {
		depth = of_read_number(distance_ref_points, 1);
	} else {
		if (distance_ref_points_depth < 2) {
			printk(KERN_WARNING "NUMA: "
				"short ibm,associativity-reference-points\n");
			goto err;
		}

		depth = of_read_number(&distance_ref_points[1], 1);
	}

	/*
	 * Warn and cap if the hardware supports more than
	 * MAX_DISTANCE_REF_POINTS domains.
	 */
	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
		printk(KERN_WARNING "NUMA: distance array capped at "
			"%d entries\n", MAX_DISTANCE_REF_POINTS);
		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
	}

	of_node_put(root);
	return depth;

err:
	of_node_put(root);
	return -1;
}
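
/*
 * Illustrative example (hypothetical device tree): with form 1 affinity,
 * ibm,associativity-reference-points = <0x4 0x2> marks the 4th entry of
 * each ibm,associativity array as the most significant NUMA boundary and
 * the 2nd as the next one; find_min_common_depth() then returns 4.
 */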

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

static unsigned long read_n_cells(int n, const __be32 **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | of_read_number(*buf, 1);
		(*buf)++;
	}
	return result;
}
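
/*
 * Worked example: with n == 2 and *buf pointing at the big-endian cells
 * <0x1 0x80000000>, read_n_cells() returns 0x180000000 and advances *buf
 * past both cells.
 */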

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const __be32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct assoc_arrays *aa)
{
	struct device_node *memory;
	const __be32 *prop;
	u32 len;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (!memory)
		return -1;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int)) {
		of_node_put(memory);
		return -1;
	}

	aa->n_arrays = of_read_number(prop++, 1);
	aa->array_sz = of_read_number(prop++, 1);

	of_node_put(memory);

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}
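
/*
 * Illustrative layout (hypothetical values): a property of
 * <2 4  0 0 0 0  0 0 0 1> describes N = 2 arrays of M = 4 cells each;
 * aa->arrays is left pointing at the first cell of the first array.
 */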

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
	struct assoc_arrays aa = { .arrays = NULL };
	int default_nid = 0;
	int nid = default_nid;
	int rc, index;

	rc = of_get_assoc_arrays(&aa);
	if (rc)
		return default_nid;

	if (min_common_depth > 0 && min_common_depth <= aa.array_sz &&
	    !(lmb->flags & DRCONF_MEM_AI_INVALID) &&
	    lmb->aa_index < aa.n_arrays) {
		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
		nid = of_read_number(&aa.arrays[index], 1);

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;

		if (nid > 0) {
			index = lmb->aa_index * aa.array_sz;
			initialize_distance_lookup_table(nid,
							&aa.arrays[index]);
		}
	}

	return nid;
}

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int numa_setup_cpu(unsigned long lcpu)
{
	int nid = NUMA_NO_NODE;
	struct device_node *cpu;

	/*
	 * If a valid cpu-to-node mapping is already available, use it
	 * directly instead of querying the firmware, since it represents
	 * the most recent mapping notified to us by the platform (eg: VPHN).
	 */
	if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
		map_cpu_to_node(lcpu, nid);
		return nid;
	}

	cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		if (cpu_present(lcpu))
			goto out_present;
		else
			goto out;
	}

	nid = of_node_to_nid_single(cpu);

out_present:
	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	map_cpu_to_node(lcpu, nid);
	of_node_put(cpu);
out:
	return nid;
}

static void verify_cpu_node_mapping(int cpu, int node)
{
	int base, sibling, i;

	/* Verify that all the threads in the core belong to the same node */
	base = cpu_first_thread_sibling(cpu);

	for (i = 0; i < threads_per_core; i++) {
		sibling = base + i;

		if (sibling == cpu || cpu_is_offline(sibling))
			continue;

		if (cpu_to_node(sibling) != node) {
			WARN(1, "CPU thread siblings %d and %d don't belong"
				" to the same node!\n", cpu, sibling);
			break;
		}
	}
}

/* Must run before sched domains notifier. */
static int ppc_numa_cpu_prepare(unsigned int cpu)
{
	int nid;

	nid = numa_setup_cpu(cpu);
	verify_cpu_node_mapping(cpu, nid);
	return 0;
}

static int ppc_numa_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	unmap_cpu_from_node(cpu);
#endif
	return 0;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit. Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= memblock_end_of_DRAM())
		return size;

	if (start >= memblock_end_of_DRAM())
		return 0;

	return memblock_end_of_DRAM() - start;
}
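
/*
 * Worked example (illustrative): with memblock_end_of_DRAM() at 4GB, a
 * region starting at 3.5GB with size 1GB is trimmed to 0.5GB, and a
 * region starting at or above 4GB is discarded (returned size 0).
 */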

/*
 * Reads the counter for a given entry in
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const __be32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) duples.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node. This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
					const __be32 **usm)
{
	unsigned int ranges, is_kexec_kdump = 0;
	unsigned long base, size, sz;
	int nid;

	/*
	 * Skip this block if the reserved bit is set in flags (0x80)
	 * or if the block is not assigned to this partition (0x8)
	 */
	if ((lmb->flags & DRCONF_MEM_RESERVED)
	    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
		return;

	if (*usm)
		is_kexec_kdump = 1;

	base = lmb->base_addr;
	size = drmem_lmb_size();
	ranges = 1;

	if (is_kexec_kdump) {
		ranges = read_usm_ranges(usm);
		if (!ranges) /* there are no (base, size) duples */
			return;
	}

	do {
		if (is_kexec_kdump) {
			base = read_n_cells(n_mem_addr_cells, usm);
			size = read_n_cells(n_mem_size_cells, usm);
		}

		nid = of_drconf_to_nid_single(lmb);
		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
					  &nid);
		node_set_online(nid);
		sz = numa_enforce_memory_limit(base, size);
		if (sz)
			memblock_set_node(base, sz, &memblock.memory, nid);
	} while (--ranges);
}

static int __init parse_numa_properties(void)
{
	struct device_node *memory;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0)
		return min_common_depth;

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		struct device_node *cpu;
		int nid;

		cpu = of_get_cpu_node(i, NULL);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);

	for_each_node_by_type(memory, "memory") {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties. If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		size = numa_enforce_memory_limit(start, size);
		if (size)
			memblock_set_node(start, size, &memblock.memory, nid);

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each MEMBLOCK listed in the
	 * ibm,dynamic-memory property in the
	 * ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
		of_node_put(memory);
	}

	return 0;
}

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = memblock_end_of_DRAM();
	unsigned long total_ram = memblock_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int nid = 0;
	struct memblock_region *reg;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for_each_memblock(memory, reg) {
		start_pfn = memblock_region_memory_base_pfn(reg);
		end_pfn = memblock_region_memory_end_pfn(reg);

		fake_numa_create_new_node(end_pfn, &nid);
		memblock_set_node(PFN_PHYS(start_pfn),
				  PFN_PHYS(end_pfn - start_pfn),
				  &memblock.memory, nid);
		node_set_online(nid);
	}
}

void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		pr_info("Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
			if (cpumask_test_cpu(cpu,
					node_to_cpumask_map[node])) {
				if (count == 0)
					pr_cont(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					pr_cont("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			pr_cont("-%u", nr_cpu_ids - 1);
		pr_cont("\n");
	}
}
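
/*
 * Example output (illustrative): on a two-node machine with eight
 * CPUs per node this prints
 *   Node 0 CPUs: 0-7
 *   Node 1 CPUs: 8-15
 */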

/* Initialize NODE_DATA for a node on the local memory */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
	u64 spanned_pages = end_pfn - start_pfn;
	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
	u64 nd_pa;
	void *nd;
	int tnid;

	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa)
		panic("Cannot allocate %zu bytes for node %d data\n",
		      nd_size, nid);

	nd = __va(nd_pa);

	/* report and initialize */
	pr_info("  NODE_DATA [mem %#010Lx-%#010Lx]\n",
		nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		pr_info("    NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
	NODE_DATA(nid)->node_id = nid;
	NODE_DATA(nid)->node_start_pfn = start_pfn;
	NODE_DATA(nid)->node_spanned_pages = spanned_pages;
}

static void __init find_possible_nodes(void)
{
	struct device_node *rtas;
	u32 numnodes, i;

	if (min_common_depth <= 0)
		return;

	rtas = of_find_node_by_path("/rtas");
	if (!rtas)
		return;

	if (of_property_read_u32_index(rtas,
				"ibm,max-associativity-domains",
				min_common_depth, &numnodes))
		goto out;

	for (i = 0; i < numnodes; i++) {
		if (!node_possible(i))
			node_set(i, node_possible_map);
	}

out:
	of_node_put(rtas);
}
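
/*
 * Illustrative example (hypothetical property): if the rtas node carries
 * ibm,max-associativity-domains = <4 2 4 8 32> and min_common_depth is 4,
 * the cell at index 4 (here 32) is read as the maximum node count, so
 * node ids 0..31 are marked possible above.
 */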

void __init mem_topology_setup(void)
{
	int cpu;

	if (parse_numa_properties())
		setup_nonnuma();

	/*
	 * Modify the set of possible NUMA nodes to reflect information
	 * available about the set of online nodes, and the set of nodes
	 * that we expect to make use of for this platform's affinity
	 * calculations.
	 */
	nodes_and(node_possible_map, node_possible_map, node_online_map);

	find_possible_nodes();

	setup_node_to_cpumask_map();

	reset_numa_cpu_lookup_table();

	for_each_present_cpu(cpu)
		numa_setup_cpu(cpu);
}

void __init initmem_init(void)
{
	int nid;

	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	memblock_dump_all();

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		setup_node_data(nid, start_pfn, end_pfn);
		sparse_memory_present_with_active_regions(nid);
	}

	sparse_init();

	/*
	 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
	 * even before we online them, so that we can use cpu_to_{node,mem}
	 * early in boot, cf. smp_prepare_cpus().
	 * _nocalls() + manual invocation is used because cpuhp is not yet
	 * initialized for the boot CPU.
	 */
	cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
				  ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);
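
/*
 * Example kernel command lines (illustrative):
 *   numa=off          disable NUMA parsing entirely
 *   numa=debug        enable the dbg() messages in this file
 *   numa=fake=1G,3G   carve boot memory into fake nodes at 1GB and 3GB
 * The options can be combined, e.g. "numa=debug,fake=2G".
 */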

/*
 * The platform can inform us through one of several mechanisms
 * (post-migration device tree updates, PRRN or VPHN) that the NUMA
 * assignment of a resource has changed. This controls whether we act
 * on that. Disabled by default.
 */
static bool topology_updates_enabled;

static int __init early_topology_updates(char *p)
{
	if (!p)
		return 0;

	if (!strcmp(p, "on")) {
		pr_warn("Caution: enabling topology updates\n");
		topology_updates_enabled = true;
	}

	return 0;
}
early_param("topology_updates", early_topology_updates);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
	struct drmem_lmb *lmb;
	unsigned long lmb_size;
	int nid = NUMA_NO_NODE;

	lmb_size = drmem_lmb_size();

	for_each_drmem_lmb(lmb) {
		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((lmb->flags & DRCONF_MEM_RESERVED)
		    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
			continue;

		if ((scn_addr < lmb->base_addr)
		    || (scn_addr >= (lmb->base_addr + lmb_size)))
			continue;

		nid = of_drconf_to_nid_single(lmb);
		break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each memblock.
 */
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory;
	int nid = NUMA_NO_NODE;

	for_each_node_by_type(memory, "memory") {
		unsigned long start, size;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		while (ranges--) {
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			if ((scn_addr < start) || (scn_addr >= (start + size)))
				continue;

			nid = of_node_to_nid_single(memory);
			break;
		}

		if (nid >= 0)
			break;
	}

	of_node_put(memory);

	return nid;
}

/*
 * Find the node associated with a hot added memory section.  Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK.  It is assumed that
 * sections are fully contained within a single MEMBLOCK.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid;

	if (!numa_enabled || (min_common_depth < 0))
		return first_online_node;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(scn_addr);
		of_node_put(memory);
	} else {
		nid = hot_add_node_scn_to_nid(scn_addr);
	}

	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	return nid;
}
NF
1024 } else {
1025 nid = hot_add_node_scn_to_nid(scn_addr);
0db9360a 1026 }
237a0989 1027
2a8628d4 1028 if (nid < 0 || !node_possible(nid))
72c33688 1029 nid = first_online_node;
237a0989 1030
0f16ef7f 1031 return nid;
237a0989 1032}
0f16ef7f 1033
cd34206e
NA
1034static u64 hot_add_drconf_memory_max(void)
1035{
e70bd3ae 1036 struct device_node *memory = NULL;
45b64ee6 1037 struct device_node *dn = NULL;
45b64ee6 1038 const __be64 *lrdr = NULL;
45b64ee6
BR
1039
1040 dn = of_find_node_by_path("/rtas");
1041 if (dn) {
1042 lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
1043 of_node_put(dn);
1044 if (lrdr)
1045 return be64_to_cpup(lrdr);
1046 }
cd34206e 1047
e70bd3ae
BR
1048 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1049 if (memory) {
e70bd3ae 1050 of_node_put(memory);
514a9cb3 1051 return drmem_lmb_memory_max();
e70bd3ae 1052 }
45b64ee6 1053 return 0;
cd34206e
NA
1054}
1055
1056/*
1057 * memory_hotplug_max - return max address of memory that may be added
1058 *
1059 * This is currently only used on systems that support drconfig memory
1060 * hotplug.
1061 */
1062u64 memory_hotplug_max(void)
1063{
1064 return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
1065}
237a0989 1066#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR

#include "book3s64/vphn.h"

struct topology_update_data {
	struct topology_update_data *next;
	unsigned int cpu;
	int old_nid;
	int new_nid;
};

#define TOPOLOGY_DEF_TIMER_SECS	60

static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
static cpumask_t cpu_associativity_changes_mask;
static int vphn_enabled;
static int prrn_enabled;
static void reset_topology_timer(void);
static int topology_timer_secs = 1;
static int topology_inited;

/*
 * Change polling interval for associativity changes.
 */
int timed_topology_update(int nsecs)
{
	if (vphn_enabled) {
		if (nsecs > 0)
			topology_timer_secs = nsecs;
		else
			topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;

		reset_topology_timer();
	}

	return 0;
}

/*
 * Store the current values of the associativity change counters
 * maintained by the hypervisor.
 */
static void setup_cpu_associativity_change_counters(void)
{
	int cpu;

	/* The VPHN feature supports a maximum of 8 reference points */
	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);

	for_each_possible_cpu(cpu) {
		int i;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++)
			counts[i] = hypervisor_counts[i];
	}
}

/*
 * The hypervisor maintains a set of 8 associativity change counters in
 * the VPA of each cpu that correspond to the associativity levels in the
 * ibm,associativity-reference-points property. When an associativity
 * level changes, the corresponding counter is incremented.
 *
 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
 * node associativity levels have changed.
 *
 * Returns the number of cpus with unhandled associativity changes.
 */
static int update_cpu_associativity_changes_mask(void)
{
	int cpu;
	cpumask_t *changes = &cpu_associativity_changes_mask;

	for_each_possible_cpu(cpu) {
		int i, changed = 0;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++) {
			if (hypervisor_counts[i] != counts[i]) {
				counts[i] = hypervisor_counts[i];
				changed = 1;
			}
		}
		if (changed) {
			cpumask_or(changes, changes, cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
		}
	}

	return cpumask_weight(changes);
}
1162}
1163
9eff1a38
JL
1164/*
1165 * Retrieve the new associativity information for a virtual processor's
1166 * home node.
1167 */
b08a2a12 1168static long hcall_vphn(unsigned long cpu, __be32 *associativity)
9eff1a38 1169{
cd9d6cc7 1170 long rc;
9eff1a38
JL
1171 long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
1172 u64 flags = 1;
1173 int hwcpu = get_hard_smp_processor_id(cpu);
1174
1175 rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
1176 vphn_unpack_associativity(retbuf, associativity);
1177
1178 return rc;
1179}
1180
1181static long vphn_get_associativity(unsigned long cpu,
b08a2a12 1182 __be32 *associativity)
9eff1a38 1183{
cd9d6cc7 1184 long rc;
9eff1a38
JL
1185
1186 rc = hcall_vphn(cpu, associativity);
1187
1188 switch (rc) {
1189 case H_FUNCTION:
437ccdc8 1190 printk_once(KERN_INFO
9eff1a38
JL
1191 "VPHN is not supported. Disabling polling...\n");
1192 stop_topology_update();
1193 break;
1194 case H_HARDWARE:
1195 printk(KERN_ERR
1196 "hcall_vphn() experienced a hardware fault "
1197 "preventing VPHN. Disabling polling...\n");
1198 stop_topology_update();
17f444c0
MB
1199 break;
1200 case H_SUCCESS:
1201 dbg("VPHN hcall succeeded. Reset polling...\n");
cee5405d 1202 timed_topology_update(0);
17f444c0 1203 break;
9eff1a38
JL
1204 }
1205
1206 return rc;
1207}
1208
e67e02a5 1209int find_and_online_cpu_nid(int cpu)
ea05ba7c
MB
1210{
1211 __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
1212 int new_nid;
1213
1214 /* Use associativity from first thread for all siblings */
2483ef05
SD
1215 if (vphn_get_associativity(cpu, associativity))
1216 return cpu_to_node(cpu);
1217
ea05ba7c
MB
1218 new_nid = associativity_to_nid(associativity);
1219 if (new_nid < 0 || !node_possible(new_nid))
1220 new_nid = first_online_node;
1221
1222 if (NODE_DATA(new_nid) == NULL) {
1223#ifdef CONFIG_MEMORY_HOTPLUG
1224 /*
1225 * Need to ensure that NODE_DATA is initialized for a node from
1226 * available memory (see memblock_alloc_try_nid). If unable to
1227 * init the node, then default to nearest node that has memory
ac1788cc
SD
1228 * installed. Skip onlining a node if the subsystems are not
1229 * yet initialized.
ea05ba7c 1230 */
ac1788cc 1231 if (!topology_inited || try_online_node(new_nid))
ea05ba7c
MB
1232 new_nid = first_online_node;
1233#else
1234 /*
1235 * Default to using the nearest node that has memory installed.
1236 * Otherwise, it would be necessary to patch the kernel MM code
1237 * to deal with more memoryless-node error conditions.
1238 */
1239 new_nid = first_online_node;
1240#endif
1241 }
1242
e67e02a5
MB
1243 pr_debug("%s:%d cpu %d nid %d\n", __FUNCTION__, __LINE__,
1244 cpu, new_nid);
ea05ba7c
MB
1245 return new_nid;
1246}

/*
 * Update the CPU maps and sysfs entries for a single CPU when its NUMA
 * characteristics change. This function doesn't perform any locking and is
 * only safe to call from stop_machine().
 */
static int update_cpu_topology(void *data)
{
	struct topology_update_data *update;
	unsigned long cpu;

	if (!data)
		return -EINVAL;

	cpu = smp_processor_id();

	for (update = data; update; update = update->next) {
		int new_nid = update->new_nid;
		if (cpu != update->cpu)
			continue;

		unmap_cpu_from_node(cpu);
		map_cpu_to_node(cpu, new_nid);
		set_cpu_numa_node(cpu, new_nid);
		set_cpu_numa_mem(cpu, local_memory_node(new_nid));
		vdso_getcpu_init();
	}

	return 0;
}

static int update_lookup_table(void *data)
{
	struct topology_update_data *update;

	if (!data)
		return -EINVAL;

	/*
	 * Upon topology update, the numa-cpu lookup table needs to be updated
	 * for all threads in the core, including offline CPUs, to ensure that
	 * future hotplug operations respect the cpu-to-node associativity
	 * properly.
	 */
	for (update = data; update; update = update->next) {
		int nid, base, j;

		nid = update->new_nid;
		base = cpu_first_thread_sibling(update->cpu);

		for (j = 0; j < threads_per_core; j++) {
			update_numa_cpu_lookup_table(base + j, nid);
		}
	}

	return 0;
}

/*
 * Update the node maps and sysfs entries for each cpu whose home node
 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
 *
 * cpus_locked says whether we already hold cpu_hotplug_lock.
 */
int numa_update_cpu_topology(bool cpus_locked)
{
	unsigned int cpu, sibling, changed = 0;
	struct topology_update_data *updates, *ud;
	cpumask_t updated_cpus;
	struct device *dev;
	int weight, new_nid, i = 0;

	if (!prrn_enabled && !vphn_enabled && topology_inited)
		return 0;

	weight = cpumask_weight(&cpu_associativity_changes_mask);
	if (!weight)
		return 0;

	updates = kcalloc(weight, sizeof(*updates), GFP_KERNEL);
	if (!updates)
		return 0;

	cpumask_clear(&updated_cpus);

	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
		/*
		 * If siblings aren't flagged for changes, the updates list
		 * will be too short. Skip this update and flag the siblings
		 * for the next one.
		 */
		if (!cpumask_subset(cpu_sibling_mask(cpu),
					&cpu_associativity_changes_mask)) {
			pr_info("Sibling bits not set for associativity "
					"change, cpu%d\n", cpu);
			cpumask_or(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		new_nid = find_and_online_cpu_nid(cpu);

		if (new_nid == numa_cpu_lookup_table[cpu]) {
			cpumask_andnot(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			dbg("Assoc chg gives same node %d for cpu%d\n",
				new_nid, cpu);
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
			ud = &updates[i++];
			ud->next = &updates[i];
			ud->cpu = sibling;
			ud->new_nid = new_nid;
			ud->old_nid = numa_cpu_lookup_table[sibling];
			cpumask_set_cpu(sibling, &updated_cpus);
		}
		cpu = cpu_last_thread_sibling(cpu);
	}

	/*
	 * Prevent processing of 'updates' from overflowing the array
	 * where the last entry filled in a 'next' pointer.
	 */
	if (i)
		updates[i-1].next = NULL;

	pr_debug("Topology update for the following CPUs:\n");
	if (cpumask_weight(&updated_cpus)) {
		for (ud = &updates[0]; ud; ud = ud->next) {
			pr_debug("cpu %d moving from node %d "
					  "to %d\n", ud->cpu,
					  ud->old_nid, ud->new_nid);
		}
	}

	/*
	 * In cases where we have nothing to update (because the updates list
	 * is too short or because the new topology is same as the old one),
	 * skip invoking update_cpu_topology() via stop-machine(). This is
	 * necessary (and not just a fast-path optimization) since stop-machine
	 * can end up electing a random CPU to run update_cpu_topology(), and
	 * thus trick us into setting up incorrect cpu-node mappings (since
	 * 'updates' is kzalloc()'ed).
	 *
	 * And for a similar reason, we will skip all the following updating.
	 */
	if (!cpumask_weight(&updated_cpus))
		goto out;

	if (cpus_locked)
		stop_machine_cpuslocked(update_cpu_topology, &updates[0],
					&updated_cpus);
	else
		stop_machine(update_cpu_topology, &updates[0], &updated_cpus);

	/*
	 * Update the numa-cpu lookup table with the new mappings, even for
	 * offline CPUs. It is best to perform this update from the stop-
	 * machine context.
	 */
	if (cpus_locked)
		stop_machine_cpuslocked(update_lookup_table, &updates[0],
					cpumask_of(raw_smp_processor_id()));
	else
		stop_machine(update_lookup_table, &updates[0],
			     cpumask_of(raw_smp_processor_id()));

	for (ud = &updates[0]; ud; ud = ud->next) {
		unregister_cpu_under_node(ud->cpu, ud->old_nid);
		register_cpu_under_node(ud->cpu, ud->new_nid);

		dev = get_cpu_device(ud->cpu);
		if (dev)
			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
		cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
		changed = 1;
	}

out:
	kfree(updates);
	return changed;
}

int arch_update_cpu_topology(void)
{
	return numa_update_cpu_topology(true);
}

static void topology_work_fn(struct work_struct *work)
{
	rebuild_sched_domains();
}
static DECLARE_WORK(topology_work, topology_work_fn);

static void topology_schedule_update(void)
{
	schedule_work(&topology_work);
}

static void topology_timer_fn(struct timer_list *unused)
{
	if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
		topology_schedule_update();
	else if (vphn_enabled) {
		if (update_cpu_associativity_changes_mask() > 0)
			topology_schedule_update();
		reset_topology_timer();
	}
}
static struct timer_list topology_timer;

static void reset_topology_timer(void)
{
	if (vphn_enabled)
		mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ);
}

#ifdef CONFIG_SMP

static int dt_update_callback(struct notifier_block *nb,
				unsigned long action, void *data)
{
	struct of_reconfig_data *update = data;
	int rc = NOTIFY_DONE;

	switch (action) {
	case OF_RECONFIG_UPDATE_PROPERTY:
		if (of_node_is_type(update->dn, "cpu") &&
		    !of_prop_cmp(update->prop->name, "ibm,associativity")) {
			u32 core_id;
			of_property_read_u32(update->dn, "reg", &core_id);
			rc = dlpar_cpu_readd(core_id);
			rc = NOTIFY_OK;
		}
		break;
	}

	return rc;
}

static struct notifier_block dt_update_nb = {
	.notifier_call = dt_update_callback,
};

#endif

/*
 * Start polling for associativity changes.
 */
int start_topology_update(void)
{
	int rc = 0;

	if (!topology_updates_enabled)
		return 0;

	if (firmware_has_feature(FW_FEATURE_PRRN)) {
		if (!prrn_enabled) {
			prrn_enabled = 1;
#ifdef CONFIG_SMP
			rc = of_reconfig_notifier_register(&dt_update_nb);
#endif
		}
	}
	if (firmware_has_feature(FW_FEATURE_VPHN) &&
	    lppaca_shared_proc(get_lppaca())) {
		if (!vphn_enabled) {
			vphn_enabled = 1;
			setup_cpu_associativity_change_counters();
			timer_setup(&topology_timer, topology_timer_fn,
				    TIMER_DEFERRABLE);
			reset_topology_timer();
		}
	}

	pr_info("Starting topology update%s%s\n",
		(prrn_enabled ? " prrn_enabled" : ""),
		(vphn_enabled ? " vphn_enabled" : ""));

	return rc;
}

/*
 * Disable polling for VPHN associativity changes.
 */
int stop_topology_update(void)
{
	int rc = 0;

	if (!topology_updates_enabled)
		return 0;

	if (prrn_enabled) {
		prrn_enabled = 0;
#ifdef CONFIG_SMP
		rc = of_reconfig_notifier_unregister(&dt_update_nb);
#endif
	}
	if (vphn_enabled) {
		vphn_enabled = 0;
		rc = del_timer_sync(&topology_timer);
	}

	pr_info("Stopping topology update\n");

	return rc;
}

int prrn_is_enabled(void)
{
	return prrn_enabled;
}

void __init shared_proc_topology_init(void)
{
	if (lppaca_shared_proc(get_lppaca())) {
		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
			    nr_cpumask_bits);
		numa_update_cpu_topology(false);
	}
}

static int topology_read(struct seq_file *file, void *v)
{
	if (vphn_enabled || prrn_enabled)
		seq_puts(file, "on\n");
	else
		seq_puts(file, "off\n");

	return 0;
}

static int topology_open(struct inode *inode, struct file *file)
{
	return single_open(file, topology_read, NULL);
}

static ssize_t topology_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *off)
{
	char kbuf[4]; /* "on" or "off" plus null. */
	int read_len;

	read_len = count < 3 ? count : 3;
	if (copy_from_user(kbuf, buf, read_len))
		return -EINVAL;

	kbuf[read_len] = '\0';

	if (!strncmp(kbuf, "on", 2)) {
		topology_updates_enabled = true;
		start_topology_update();
	} else if (!strncmp(kbuf, "off", 3)) {
		stop_topology_update();
		topology_updates_enabled = false;
	} else
		return -EINVAL;

	return count;
}
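
/*
 * Example usage (illustrative): reading /proc/powerpc/topology_updates
 * reports "on" or "off", and writing those strings toggles polling:
 *   echo on  > /proc/powerpc/topology_updates
 *   echo off > /proc/powerpc/topology_updates
 */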

static const struct file_operations topology_ops = {
	.read = seq_read,
	.write = topology_write,
	.open = topology_open,
	.release = single_release
};

static int topology_update_init(void)
{
	start_topology_update();

	if (vphn_enabled)
		topology_schedule_update();

	if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
		return -ENOMEM;

	topology_inited = 1;
	return 0;
}
device_initcall(topology_update_init);
#endif /* CONFIG_PPC_SPLPAR */