arch/x86/mm/numa_64.c

   1 /*
   2  * Generic VM initialization for x86-64 NUMA setups.
   3  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4  */
   5 #include <linux/kernel.h>
   6 #include <linux/mm.h>
   7 #include <linux/string.h>
   8 #include <linux/init.h>
   9 #include <linux/bootmem.h>
  10 #include <linux/mmzone.h>
  11 #include <linux/ctype.h>
  12 #include <linux/module.h>
  13 #include <linux/nodemask.h>
  14 #include <linux/sched.h>
  15
  16 #include <asm/e820.h>
  17 #include <asm/proto.h>
  18 #include <asm/dma.h>
  19 #include <asm/numa.h>
  20 #include <asm/acpi.h>
  21 #include <asm/k8.h>
  22
/* Debug print hook: expands to nothing unless Dprintk was defined earlier. */
#ifndef Dprintk
#define Dprintk(x...)
#endif

/* Per-node pglist_data pointers; entries are filled in by setup_node_bootmem(). */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

/* Per-node bootmem descriptors; setup_node_bootmem() points each pgdat's ->bdata here. */
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

/*
 * Provides embedded_map, the default backing store for memnodemap.
 * NOTE(review): memnodemap, memnodemapsize and memnode_shift are presumably
 * accessors into this structure defined in a header - confirm.
 */
struct memnode memnode;

/* Boot-time cpu -> node table; every entry starts out as NUMA_NO_NODE. */
int x86_cpu_to_node_map_init[NR_CPUS] = {
        [0 ... NR_CPUS-1] = NUMA_NO_NODE
};
/* When non-NULL, numa_set_node() stores into the int array this points at. */
void *x86_cpu_to_node_map_early_ptr;
/* Per-cpu cpu -> node value, written by numa_set_node() when the early pointer is NULL. */
DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);

/* Local APIC id -> node table, consulted by init_cpu_to_node(). */
s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

/* Per-node mask of CPUs; bits are set by numa_add_cpu(). */
cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_to_cpumask_map);

/* Set to 1 by the "numa=off" early parameter. */
int numa_off __initdata;
/*
 * Physical address and size of the memnodemap area found by
 * allocate_cachealigned_memnodemap(); both are 0 if that search failed.
 */
unsigned long __initdata nodemap_addr;
unsigned long __initdata nodemap_size;
  52
  53 /*
  54  * Given a shift value, try to populate memnodemap[]
  55  * Returns :
  56  * 1 if OK
  57  * 0 if memnodmap[] too small (of shift too small)
  58  * -1 if node overlap or lost ram (shift too big)
  59  */
  60 static int __init populate_memnodemap(const struct bootnode *nodes,
  61                                       int numnodes, int shift)
  62 {
  63         unsigned long addr, end;
  64         int i, res = -1;
  65
  66         memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
  67         for (i = 0; i < numnodes; i++) {
  68                 addr = nodes[i].start;
  69                 end = nodes[i].end;
  70                 if (addr >= end)
  71                         continue;
  72                 if ((end >> shift) >= memnodemapsize)
  73                         return 0;
  74                 do {
  75                         if (memnodemap[addr >> shift] != NUMA_NO_NODE)
  76                                 return -1;
  77                         memnodemap[addr >> shift] = i;
  78                         addr += (1UL << shift);
  79                 } while (addr < end);
  80                 res = 1;
  81         }
  82         return res;
  83 }
  84
  85 static int __init allocate_cachealigned_memnodemap(void)
  86 {
  87         unsigned long pad, pad_addr;
  88
  89         memnodemap = memnode.embedded_map;
  90         if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
  91                 return 0;
  92
  93         pad = L1_CACHE_BYTES - 1;
  94         pad_addr = 0x8000;
  95         nodemap_size = pad + sizeof(s16) * memnodemapsize;
  96         nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
  97                                       nodemap_size);
  98         if (nodemap_addr == -1UL) {
  99                 printk(KERN_ERR
 100                        "NUMA: Unable to allocate Memory to Node hash map\n");
 101                 nodemap_addr = nodemap_size = 0;
 102                 return -1;
 103         }
 104         pad_addr = (nodemap_addr + pad) & ~pad;
 105         memnodemap = phys_to_virt(pad_addr);
 106         reserve_early(nodemap_addr, nodemap_addr + nodemap_size);
 107
 108         printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
 109                nodemap_addr, nodemap_addr + nodemap_size);
 110         return 0;
 111 }
 112
 113 /*
 114  * The LSB of all start and end addresses in the node map is the value of the
 115  * maximum possible shift.
 116  */
 117 static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
 118                                          int numnodes)
 119 {
 120         int i, nodes_used = 0;
 121         unsigned long start, end;
 122         unsigned long bitfield = 0, memtop = 0;
 123
 124         for (i = 0; i < numnodes; i++) {
 125                 start = nodes[i].start;
 126                 end = nodes[i].end;
 127                 if (start >= end)
 128                         continue;
 129                 bitfield |= start;
 130                 nodes_used++;
 131                 if (end > memtop)
 132                         memtop = end;
 133         }
 134         if (nodes_used <= 1)
 135                 i = 63;
 136         else
 137                 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
 138         memnodemapsize = (memtop >> i)+1;
 139         return i;
 140 }
 141
 142 int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
 143 {
 144         int shift;
 145
 146         shift = extract_lsb_from_nodes(nodes, numnodes);
 147         if (allocate_cachealigned_memnodemap())
 148                 return -1;
 149         printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
 150                 shift);
 151
 152         if (populate_memnodemap(nodes, numnodes, shift) != 1) {
 153                 printk(KERN_INFO "Your memory is not aligned you need to "
 154                        "rebuild your kernel with a bigger NODEMAPSIZE "
 155                        "shift=%d\n", shift);
 156                 return -1;
 157         }
 158         return shift;
 159 }
 160
 161 int early_pfn_to_nid(unsigned long pfn)
 162 {
 163         return phys_to_nid(pfn << PAGE_SHIFT);
 164 }
 165
 166 static void * __init early_node_mem(int nodeid, unsigned long start,
 167                                     unsigned long end, unsigned long size)
 168 {
 169         unsigned long mem = find_e820_area(start, end, size);
 170         void *ptr;
 171
 172         if (mem != -1L)
 173                 return __va(mem);
 174         ptr = __alloc_bootmem_nopanic(size,
 175                                 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
 176         if (ptr == NULL) {
 177                 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
 178                        size, nodeid);
 179                 return NULL;
 180         }
 181         return ptr;
 182 }
 183
 184 /* Initialize bootmem allocator for a node */
 185 void __init setup_node_bootmem(int nodeid, unsigned long start,
 186                                unsigned long end)
 187 {
 188         unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size;
 189         unsigned long bootmap_start, nodedata_phys;
 190         void *bootmap;
 191         const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
 192
 193         start = round_up(start, ZONE_ALIGN);
 194
 195         printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
 196                start, end);
 197
 198         start_pfn = start >> PAGE_SHIFT;
 199         end_pfn = end >> PAGE_SHIFT;
 200
 201         node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
 202         if (node_data[nodeid] == NULL)
 203                 return;
 204         nodedata_phys = __pa(node_data[nodeid]);
 205
 206         memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
 207         NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
 208         NODE_DATA(nodeid)->node_start_pfn = start_pfn;
 209         NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
 210
 211         /* Find a place for the bootmem map */
 212         bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
 213         bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
 214         bootmap = early_node_mem(nodeid, bootmap_start, end,
 215                                         bootmap_pages<<PAGE_SHIFT);
 216         if (bootmap == NULL)  {
 217                 if (nodedata_phys < start || nodedata_phys >= end)
 218                         free_bootmem((unsigned long)node_data[nodeid],
 219                                      pgdat_size);
 220                 node_data[nodeid] = NULL;
 221                 return;
 222         }
 223         bootmap_start = __pa(bootmap);
 224         Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
 225
 226         bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
 227                                          bootmap_start >> PAGE_SHIFT,
 228                                          start_pfn, end_pfn);
 229
 230         free_bootmem_with_active_regions(nodeid, end);
 231
 232         reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
 233         reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
 234                              bootmap_pages<<PAGE_SHIFT);
 235 #ifdef CONFIG_ACPI_NUMA
 236         srat_reserve_add_area(nodeid);
 237 #endif
 238         node_set_online(nodeid);
 239 }
 240
 241 /*
 242  * There are unfortunately some poorly designed mainboards around that
 243  * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 244  * mapping. To avoid this fill in the mapping for all possible CPUs,
 245  * as the number of CPUs is not known yet. We round robin the existing
 246  * nodes.
 247  */
 248 void __init numa_init_array(void)
 249 {
 250         int rr, i;
 251
 252         rr = first_node(node_online_map);
 253         for (i = 0; i < NR_CPUS; i++) {
 254                 if (early_cpu_to_node(i) != NUMA_NO_NODE)
 255                         continue;
 256                 numa_set_node(i, rr);
 257                 rr = next_node(rr, node_online_map);
 258                 if (rr == MAX_NUMNODES)
 259                         rr = first_node(node_online_map);
 260         }
 261 }
 262
 263 #ifdef CONFIG_NUMA_EMU
/*
 * NUMA emulation.
 *
 * Points at the text following "numa=fake=" once numa_setup() has seen
 * that option.  numa_initmem_init() only attempts emulation when this is
 * non-NULL, and numa_emulation() advances it while parsing.
 */
char *cmdline __initdata;
 266
 267 /*
 268  * Setups up nid to range from addr to addr + size.  If the end
 269  * boundary is greater than max_addr, then max_addr is used instead.
 270  * The return value is 0 if there is additional memory left for
 271  * allocation past addr and -1 otherwise.  addr is adjusted to be at
 272  * the end of the node.
 273  */
 274 static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 275                                    u64 size, u64 max_addr)
 276 {
 277         int ret = 0;
 278
 279         nodes[nid].start = *addr;
 280         *addr += size;
 281         if (*addr >= max_addr) {
 282                 *addr = max_addr;
 283                 ret = -1;
 284         }
 285         nodes[nid].end = *addr;
 286         node_set(nid, node_possible_map);
 287         printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
 288                nodes[nid].start, nodes[nid].end,
 289                (nodes[nid].end - nodes[nid].start) >> 20);
 290         return ret;
 291 }
 292
 293 /*
 294  * Splits num_nodes nodes up equally starting at node_start.  The return value
 295  * is the number of nodes split up and addr is adjusted to be at the end of the
 296  * last node allocated.
 297  */
 298 static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 299                                       u64 max_addr, int node_start,
 300                                       int num_nodes)
 301 {
 302         unsigned int big;
 303         u64 size;
 304         int i;
 305
 306         if (num_nodes <= 0)
 307                 return -1;
 308         if (num_nodes > MAX_NUMNODES)
 309                 num_nodes = MAX_NUMNODES;
 310         size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
 311                num_nodes;
 312         /*
 313          * Calculate the number of big nodes that can be allocated as a result
 314          * of consolidating the leftovers.
 315          */
 316         big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
 317               FAKE_NODE_MIN_SIZE;
 318
 319         /* Round down to nearest FAKE_NODE_MIN_SIZE. */
 320         size &= FAKE_NODE_MIN_HASH_MASK;
 321         if (!size) {
 322                 printk(KERN_ERR "Not enough memory for each node.  "
 323                        "NUMA emulation disabled.\n");
 324                 return -1;
 325         }
 326
 327         for (i = node_start; i < num_nodes + node_start; i++) {
 328                 u64 end = *addr + size;
 329
 330                 if (i < big)
 331                         end += FAKE_NODE_MIN_SIZE;
 332                 /*
 333                  * The final node can have the remaining system RAM.  Other
 334                  * nodes receive roughly the same amount of available pages.
 335                  */
 336                 if (i == num_nodes + node_start - 1)
 337                         end = max_addr;
 338                 else
 339                         while (end - *addr - e820_hole_size(*addr, end) <
 340                                size) {
 341                                 end += FAKE_NODE_MIN_SIZE;
 342                                 if (end > max_addr) {
 343                                         end = max_addr;
 344                                         break;
 345                                 }
 346                         }
 347                 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
 348                         break;
 349         }
 350         return i - node_start + 1;
 351 }
 352
 353 /*
 354  * Splits the remaining system RAM into chunks of size.  The remaining memory is
 355  * always assigned to a final node and can be asymmetric.  Returns the number of
 356  * nodes split.
 357  */
 358 static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
 359                                       u64 max_addr, int node_start, u64 size)
 360 {
 361         int i = node_start;
 362         size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
 363         while (!setup_node_range(i++, nodes, addr, size, max_addr))
 364                 ;
 365         return i - node_start;
 366 }
 367
 368 /*
 369  * Sets up the system RAM area from start_pfn to end_pfn according to the
 370  * numa=fake command-line option.
 371  */
 372 static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 373 {
 374         struct bootnode nodes[MAX_NUMNODES];
 375         u64 size, addr = start_pfn << PAGE_SHIFT;
 376         u64 max_addr = end_pfn << PAGE_SHIFT;
 377         int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
 378
 379         memset(&nodes, 0, sizeof(nodes));
 380         /*
 381          * If the numa=fake command-line is just a single number N, split the
 382          * system RAM into N fake nodes.
 383          */
 384         if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
 385                 long n = simple_strtol(cmdline, NULL, 0);
 386
 387                 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
 388                 if (num_nodes < 0)
 389                         return num_nodes;
 390                 goto out;
 391         }
 392
 393         /* Parse the command line. */
 394         for (coeff_flag = 0; ; cmdline++) {
 395                 if (*cmdline && isdigit(*cmdline)) {
 396                         num = num * 10 + *cmdline - '0';
 397                         continue;
 398                 }
 399                 if (*cmdline == '*') {
 400                         if (num > 0)
 401                                 coeff = num;
 402                         coeff_flag = 1;
 403                 }
 404                 if (!*cmdline || *cmdline == ',') {
 405                         if (!coeff_flag)
 406                                 coeff = 1;
 407                         /*
 408                          * Round down to the nearest FAKE_NODE_MIN_SIZE.
 409                          * Command-line coefficients are in megabytes.
 410                          */
 411                         size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
 412                         if (size)
 413                                 for (i = 0; i < coeff; i++, num_nodes++)
 414                                         if (setup_node_range(num_nodes, nodes,
 415                                                 &addr, size, max_addr) < 0)
 416                                                 goto done;
 417                         if (!*cmdline)
 418                                 break;
 419                         coeff_flag = 0;
 420                         coeff = -1;
 421                 }
 422                 num = 0;
 423         }
 424 done:
 425         if (!num_nodes)
 426                 return -1;
 427         /* Fill remainder of system RAM, if appropriate. */
 428         if (addr < max_addr) {
 429                 if (coeff_flag && coeff < 0) {
 430                         /* Split remaining nodes into num-sized chunks */
 431                         num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
 432                                                          num_nodes, num);
 433                         goto out;
 434                 }
 435                 switch (*(cmdline - 1)) {
 436                 case '*':
 437                         /* Split remaining nodes into coeff chunks */
 438                         if (coeff <= 0)
 439                                 break;
 440                         num_nodes += split_nodes_equally(nodes, &addr, max_addr,
 441                                                          num_nodes, coeff);
 442                         break;
 443                 case ',':
 444                         /* Do not allocate remaining system RAM */
 445                         break;
 446                 default:
 447                         /* Give one final node */
 448                         setup_node_range(num_nodes, nodes, &addr,
 449                                          max_addr - addr, max_addr);
 450                         num_nodes++;
 451                 }
 452         }
 453 out:
 454         memnode_shift = compute_hash_shift(nodes, num_nodes);
 455         if (memnode_shift < 0) {
 456                 memnode_shift = 0;
 457                 printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
 458                        "disabled.\n");
 459                 return -1;
 460         }
 461
 462         /*
 463          * We need to vacate all active ranges that may have been registered by
 464          * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
 465          * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
 466          */
 467         remove_all_active_ranges();
 468 #ifdef CONFIG_ACPI_NUMA
 469         acpi_numa = -1;
 470 #endif
 471         for_each_node_mask(i, node_possible_map) {
 472                 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 473                                                 nodes[i].end >> PAGE_SHIFT);
 474                 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 475         }
 476         acpi_fake_nodes(nodes, num_nodes);
 477         numa_init_array();
 478         return 0;
 479 }
 480 #endif /* CONFIG_NUMA_EMU */
 481
 482 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 483 {
 484         int i;
 485
 486         nodes_clear(node_possible_map);
 487
 488 #ifdef CONFIG_NUMA_EMU
 489         if (cmdline && !numa_emulation(start_pfn, end_pfn))
 490                 return;
 491         nodes_clear(node_possible_map);
 492 #endif
 493
 494 #ifdef CONFIG_ACPI_NUMA
 495         if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
 496                                           end_pfn << PAGE_SHIFT))
 497                 return;
 498         nodes_clear(node_possible_map);
 499 #endif
 500
 501 #ifdef CONFIG_K8_NUMA
 502         if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
 503                                         end_pfn<<PAGE_SHIFT))
 504                 return;
 505         nodes_clear(node_possible_map);
 506 #endif
 507         printk(KERN_INFO "%s\n",
 508                numa_off ? "NUMA turned off" : "No NUMA configuration found");
 509
 510         printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
 511                start_pfn << PAGE_SHIFT,
 512                end_pfn << PAGE_SHIFT);
 513         /* setup dummy node covering all memory */
 514         memnode_shift = 63;
 515         memnodemap = memnode.embedded_map;
 516         memnodemap[0] = 0;
 517         nodes_clear(node_online_map);
 518         node_set_online(0);
 519         node_set(0, node_possible_map);
 520         for (i = 0; i < NR_CPUS; i++)
 521                 numa_set_node(i, 0);
 522         /* cpumask_of_cpu() may not be available during early startup */
 523         memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0]));
 524         cpu_set(0, node_to_cpumask_map[0]);
 525         e820_register_active_regions(0, start_pfn, end_pfn);
 526         setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 527 }
 528
 529 __cpuinit void numa_add_cpu(int cpu)
 530 {
 531         set_bit(cpu,
 532                 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
 533 }
 534
 535 void __cpuinit numa_set_node(int cpu, int node)
 536 {
 537         int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
 538
 539         cpu_pda(cpu)->nodenumber = node;
 540
 541         if(cpu_to_node_map)
 542                 cpu_to_node_map[cpu] = node;
 543         else if(per_cpu_offset(cpu))
 544                 per_cpu(x86_cpu_to_node_map, cpu) = node;
 545         else
 546                 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
 547 }
 548
 549 unsigned long __init numa_free_all_bootmem(void)
 550 {
 551         unsigned long pages = 0;
 552         int i;
 553
 554         for_each_online_node(i)
 555                 pages += free_all_bootmem_node(NODE_DATA(i));
 556
 557         return pages;
 558 }
 559
 560 void __init paging_init(void)
 561 {
 562         unsigned long max_zone_pfns[MAX_NR_ZONES];
 563
 564         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 565         max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 566         max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 567         max_zone_pfns[ZONE_NORMAL] = end_pfn;
 568
 569         sparse_memory_present_with_active_regions(MAX_NUMNODES);
 570         sparse_init();
 571
 572         free_area_init_nodes(max_zone_pfns);
 573 }
 574
 575 static __init int numa_setup(char *opt)
 576 {
 577         if (!opt)
 578                 return -EINVAL;
 579         if (!strncmp(opt, "off", 3))
 580                 numa_off = 1;
 581 #ifdef CONFIG_NUMA_EMU
 582         if (!strncmp(opt, "fake=", 5))
 583                 cmdline = opt + 5;
 584 #endif
 585 #ifdef CONFIG_ACPI_NUMA
 586         if (!strncmp(opt, "noacpi", 6))
 587                 acpi_numa = -1;
 588         if (!strncmp(opt, "hotadd=", 7))
 589                 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
 590 #endif
 591         return 0;
 592 }
 593 early_param("numa", numa_setup);
 594
 595 /*
 596  * Setup early cpu_to_node.
 597  *
 598  * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
 599  * and apicid_to_node[] tables have valid entries for a CPU.
 600  * This means we skip cpu_to_node[] initialisation for NUMA
 601  * emulation and faking node case (when running a kernel compiled
 602  * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
 603  * is already initialized in a round robin manner at numa_init_array,
 604  * prior to this call, and this initialization is good enough
 605  * for the fake NUMA cases.
 606  */
 607 void __init init_cpu_to_node(void)
 608 {
 609         int i;
 610
 611         for (i = 0; i < NR_CPUS; i++) {
 612                 u16 apicid = x86_cpu_to_apicid_init[i];
 613
 614                 if (apicid == BAD_APICID)
 615                         continue;
 616                 if (apicid_to_node[apicid] == NUMA_NO_NODE)
 617                         continue;
 618                 numa_set_node(i, apicid_to_node[apicid]);
 619         }
 620 }
 621
 622