/*
 * linux/arch/x86_64/mm/init.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

#ifdef CONFIG_GART_IOMMU
extern int swiotlb;
#endif

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

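/*
 * Dump a summary of memory usage: free areas, free swap, and per-page
 * counts of reserved, swap-cached and shared pages across all nodes.
 */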
void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%ld pages of RAM\n", total);
        printk(KERN_INFO "%ld reserved pages\n", reserved);
        printk(KERN_INFO "%ld pages shared\n", shared);
        printk(KERN_INFO "%ld pages swap cached\n", cached);
}

/* References to section boundaries */

int after_bootmem;

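/*
 * Allocate a zeroed page for building kernel page tables.  Before
 * mem_init() has run (after_bootmem == 0) the page comes from the
 * bootmem allocator; afterwards it comes from the normal page
 * allocator via get_zeroed_page().
 */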
static void *spp_getpage(void)
{
        void *ptr;
        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);
        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n",
                      after_bootmem ? "after bootmem" : "");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}

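/*
 * Install a single kernel PTE mapping vaddr to the physical address
 * phys with protection prot, allocating intermediate page-table levels
 * with spp_getpage() as needed.  The top-level entry must already have
 * been set up (in head.S); only the new mapping's TLB entry is flushed.
 */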
static void set_pte_phys(unsigned long vaddr,
                         unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

/* NOTE: this is meant to be run only at boot */
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys, prot);
}

unsigned long __initdata table_start, table_end;

extern pmd_t temp_boot_pmds[];

static struct temp_map {
        pmd_t *pmd;
        void *address;
        int allocated;
} temp_mappings[] __initdata = {
        { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
        { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
        {}
};

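/*
 * Early page-table page allocator.  alloc_low_page() hands out the next
 * physical page after table_end and maps it through one of the two 2MB
 * temp_boot_pmds windows (at the 40MB and 42MB virtual addresses above),
 * since the direct mapping does not exist yet.  unmap_low_page() tears
 * the temporary window down again.
 */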
static __init void *alloc_low_page(int *index, unsigned long *phys)
{
        struct temp_map *ti;
        int i;
        unsigned long pfn = table_end++, paddr;
        void *adr;

        if (pfn >= end_pfn)
                panic("alloc_low_page: ran out of memory");
        /* Find the first free temporary mapping slot. */
        for (i = 0; temp_mappings[i].allocated; i++)
                ;
        if (!temp_mappings[i].pmd)
                panic("alloc_low_page: ran out of temp mappings");
        ti = &temp_mappings[i];
        paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
        set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
        ti->allocated = 1;
        __flush_tlb();
        adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
        *index = i;
        *phys = pfn * PAGE_SIZE;
        return adr;
}

static __init void unmap_low_page(int i)
{
        struct temp_map *ti = &temp_mappings[i];
        set_pmd(ti->pmd, __pmd(0));
        ti->allocated = 0;
}

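/*
 * Fill one PUD-level page table for the direct mapping of the range
 * [address, end).  Memory is mapped with 2MB PSE pages at the PMD
 * level; PUD slots whose 1GB range is not covered by the e820 map at
 * all, and any slots beyond 'end', are cleared.
 */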
static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
        long i, j;

        i = pud_index(address);
        pud = pud + i;
        for (; i < PTRS_PER_PUD; pud++, i++) {
                int map;
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = address + i*PUD_SIZE;
                if (paddr >= end) {
                        for (; i < PTRS_PER_PUD; i++, pud++)
                                set_pud(pud, __pud(0));
                        break;
                }

                if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
                        set_pud(pud, __pud(0));
                        continue;
                }

                pmd = alloc_low_page(&map, &pmd_phys);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
                        unsigned long pe;

                        if (paddr >= end) {
                                for (; j < PTRS_PER_PMD; j++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                                break;
                        }
                        pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
                        pe &= __supported_pte_mask;
                        set_pmd(pmd, __pmd(pe));
                }
                unmap_low_page(map);
        }
        __flush_tlb();
}

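/*
 * Reserve room for the kernel direct-mapping page tables in the e820
 * map, below the kernel image.  The estimate is one pud_t per 1GB and
 * one pmd_t per 2MB of address space, each rounded up to a full page.
 * For example (on x86_64, with 8-byte entries), mapping 4GB needs
 * 4 PUD entries (one page) plus 2048 PMD entries (4 pages), i.e. 20KB
 * starting at table_start.
 */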
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
                 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

        table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        table_start >>= PAGE_SHIFT;
        table_end = table_start;
}

/* Set up the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __init init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the
         * memory mapped. Unfortunately this is done currently before the
         * nodes are discovered.
         */
        find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                int map;
                unsigned long pud_phys;
                pud_t *pud = alloc_low_page(&map, &pud_phys);
                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                unmap_low_page(map);
        }

        asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
        __flush_tlb_all();
        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", end,
                     table_start << PAGE_SHIFT,
                     table_end << PAGE_SHIFT);
}

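/*
 * Remove the low identity mappings left over from boot.  On the boot
 * CPU this clears the first kernel PGD entry; APs instead switch cr3
 * to init_level4_pgt, which no longer contains them, and flush the TLB.
 */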
void __cpuinit zap_low_mappings(int cpu)
{
        if (cpu == 0) {
                pgd_t *pgd = pgd_offset_k(0UL);
                pgd_clear(pgd);
        } else {
                /*
                 * For APs, zap the low identity mappings by switching
                 * cr3 to init_level4_pgt and doing a local TLB flush.
                 */
                asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
        }
        __flush_tlb_all();
}

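/*
 * Worked example for size_zones() below, assuming a single node from
 * pfn 0 with end_pfn above 4GB: z[] is first filled with cumulative
 * limits (z[ZONE_DMA] = MAX_DMA_PFN, z[ZONE_DMA32] = MAX_DMA32_PFN,
 * z[ZONE_NORMAL] = end_pfn); the subtraction loop then turns them into
 * per-zone sizes covering 0-16MB, 16MB-4GB and 4GB-end.  The hole pass
 * charges each zone its e820 gaps and mem_map space, plus dma_reserve
 * for ZONE_DMA.
 */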
/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
           unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
        unsigned long w;

        for (i = 0; i < MAX_NR_ZONES; i++)
                z[i] = 0;

        if (start_pfn < MAX_DMA_PFN)
                z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
        if (start_pfn < MAX_DMA32_PFN) {
                unsigned long dma32_pfn = MAX_DMA32_PFN;
                if (dma32_pfn > end_pfn)
                        dma32_pfn = end_pfn;
                z[ZONE_DMA32] = dma32_pfn - start_pfn;
        }
        z[ZONE_NORMAL] = end_pfn - start_pfn;

        /* Remove lower zones from higher ones. */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (z[i])
                        z[i] -= w;
                w += z[i];
        }

        /* Compute holes */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long s = w;
                w += z[i];
                h[i] = e820_hole_size(s, w);
        }

        /* Add the space needed for mem_map to the holes too. */
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

        /* The 16MB DMA zone has the kernel and other misc mappings.
           Account them too */
        if (h[ZONE_DMA]) {
                h[ZONE_DMA] += dma_reserve;
                if (h[ZONE_DMA] >= z[ZONE_DMA]) {
                        printk(KERN_WARNING
                               "Kernel too large and filling up ZONE_DMA?\n");
                        h[ZONE_DMA] = z[ZONE_DMA];
                }
        }
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
        unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
        size_zones(zones, holes, 0, end_pfn);
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (!(pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                               "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;

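/*
 * Late memory setup: initialize swiotlb if needed, hand all bootmem
 * pages over to the page allocator, register the /proc/kcore regions
 * and print the final memory statistics.  On SMP kernels it also syncs
 * boot_level4_pgt with init_level4_pgt for AP bringup.
 */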
void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;

#ifdef CONFIG_SWIOTLB
        if (!iommu_aperture &&
            (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
                swiotlb = 1;
        if (swiotlb)
                swiotlb_init();
#endif

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        max_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
               end_pfn << (PAGE_SHIFT-10),
               codesize >> 10,
               reservedpages << (PAGE_SHIFT-10),
               datasize >> 10,
               initsize >> 10);

#ifdef CONFIG_SMP
        /*
         * Sync boot_level4_pgt mappings with the init_level4_pgt
         * except for the low identity mappings which are already zapped
         * in init_level4_pgt. This sync-up is essential for AP bringup.
         */
        memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
}

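/*
 * Release the memory used by the kernel's __init sections back to the
 * page allocator.  The pages are poisoned with 0xcc (the int3 opcode)
 * so late references into freed init code are caught, and the
 * .init.data area is poisoned with 0xba.
 */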
void free_initmem(void)
{
        unsigned long addr;

        addr = (unsigned long)(&__init_begin);
        for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                set_page_count(virt_to_page(addr), 1);
                memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
                free_page(addr);
                totalram_pages++;
        }
        memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
        printk("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
}

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        if (start < (unsigned long)&_end)
                return;
        printk("Freeing initrd memory: %luk freed\n", (end - start) >> 10);
        for (; start < end; start += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(start));
                set_page_count(virt_to_page(start), 1);
                free_page(start);
                totalram_pages++;
        }
}
#endif

void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
                dma_reserve += len / PAGE_SIZE;
}

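/*
 * Return whether a kernel virtual address is backed by a valid page:
 * reject non-canonical addresses, then walk the page tables, handling
 * 2MB large pages at the PMD level.
 */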
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}

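/*
 * Register the x86-64 specific /proc/sys/debug entries: exception-trace
 * and, with CONFIG_CHECKING, page-fault-trace.
 */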
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
#ifdef CONFIG_CHECKING
        { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
#endif
        { 0, }
};

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },
        { 0 },
};

static __init int x8664_sysctl_init(void)
{
        register_sysctl_table(debug_root_table2, 1);
        return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);
        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}