1 /*
2 * linux/arch/x86_64/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 */
8
9 #include <linux/signal.h>
10 #include <linux/sched.h>
11 #include <linux/kernel.h>
12 #include <linux/errno.h>
13 #include <linux/string.h>
14 #include <linux/types.h>
15 #include <linux/ptrace.h>
16 #include <linux/mman.h>
17 #include <linux/mm.h>
18 #include <linux/swap.h>
19 #include <linux/smp.h>
20 #include <linux/init.h>
21 #include <linux/initrd.h>
22 #include <linux/pagemap.h>
23 #include <linux/bootmem.h>
24 #include <linux/memblock.h>
25 #include <linux/proc_fs.h>
26 #include <linux/pci.h>
27 #include <linux/pfn.h>
28 #include <linux/poison.h>
29 #include <linux/dma-mapping.h>
30 #include <linux/memory.h>
31 #include <linux/memory_hotplug.h>
32 #include <linux/memremap.h>
33 #include <linux/nmi.h>
34 #include <linux/gfp.h>
35 #include <linux/kcore.h>
36
37 #include <asm/processor.h>
38 #include <asm/bios_ebda.h>
39 #include <linux/uaccess.h>
40 #include <asm/pgtable.h>
41 #include <asm/pgalloc.h>
42 #include <asm/dma.h>
43 #include <asm/fixmap.h>
44 #include <asm/e820/api.h>
45 #include <asm/apic.h>
46 #include <asm/tlb.h>
47 #include <asm/mmu_context.h>
48 #include <asm/proto.h>
49 #include <asm/smp.h>
50 #include <asm/sections.h>
51 #include <asm/kdebug.h>
52 #include <asm/numa.h>
53 #include <asm/set_memory.h>
54 #include <asm/init.h>
55 #include <asm/uv/uv.h>
56 #include <asm/setup.h>
57
58 #include "mm_internal.h"
59
60 #include "ident_map.c"
61
62 /*
63 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
64 * physical space, so we can cache the location of the first one and move
65 * around without checking the pgd every time.
66 */
67
68 /* Bits supported by the hardware: */
69 pteval_t __supported_pte_mask __read_mostly = ~0;
70 /* Bits allowed in normal kernel mappings: */
71 pteval_t __default_kernel_pte_mask __read_mostly = ~0;
72 EXPORT_SYMBOL_GPL(__supported_pte_mask);
73 /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
74 EXPORT_SYMBOL(__default_kernel_pte_mask);
75
76 int force_personality32;
77
78 /*
79 * noexec32=on|off
80 * Control the non-executable heap for 32-bit processes.
81 * To control the stack too, use noexec=off.
82 *
83 * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
84 * off PROT_READ implies PROT_EXEC
85 */
86 static int __init nonx32_setup(char *str)
87 {
88 if (!strcmp(str, "on"))
89 force_personality32 &= ~READ_IMPLIES_EXEC;
90 else if (!strcmp(str, "off"))
91 force_personality32 |= READ_IMPLIES_EXEC;
92 return 1;
93 }
94 __setup("noexec32=", nonx32_setup);
95
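/*
 * Editorial note: force_personality32 is consumed at exec time -- the x86
 * process/ELF setup code (see set_personality_ia32() and friends) ORs it
 * into current->personality for 32-bit tasks, so "noexec32=off" on the
 * kernel command line restores legacy READ_IMPLIES_EXEC behaviour for all
 * 32-bit processes while leaving 64-bit tasks untouched.
 */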
96 /*
97 * When memory is added, make sure all the processes' MMs have
98 * suitable PGD entries in the local PGD-level page.
99 */
100 #ifdef CONFIG_X86_5LEVEL
101 void sync_global_pgds(unsigned long start, unsigned long end)
102 {
103 unsigned long addr;
104
105 for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
106 const pgd_t *pgd_ref = pgd_offset_k(addr);
107 struct page *page;
108
109 /* Check for overflow */
110 if (addr < start)
111 break;
112
113 if (pgd_none(*pgd_ref))
114 continue;
115
116 spin_lock(&pgd_lock);
117 list_for_each_entry(page, &pgd_list, lru) {
118 pgd_t *pgd;
119 spinlock_t *pgt_lock;
120
121 pgd = (pgd_t *)page_address(page) + pgd_index(addr);
122 /* the pgt_lock only for Xen */
123 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
124 spin_lock(pgt_lock);
125
126 if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
127 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
128
129 if (pgd_none(*pgd))
130 set_pgd(pgd, *pgd_ref);
131
132 spin_unlock(pgt_lock);
133 }
134 spin_unlock(&pgd_lock);
135 }
136 }
137 #else
138 void sync_global_pgds(unsigned long start, unsigned long end)
139 {
140 unsigned long addr;
141
142 for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
143 pgd_t *pgd_ref = pgd_offset_k(addr);
144 const p4d_t *p4d_ref;
145 struct page *page;
146
147 /*
148 * With a folded p4d, pgd_none() is always false, so we need to
149 * handle synchronization at the p4d level.
150 */
151 BUILD_BUG_ON(pgd_none(*pgd_ref));
152 p4d_ref = p4d_offset(pgd_ref, addr);
153
154 if (p4d_none(*p4d_ref))
155 continue;
156
157 spin_lock(&pgd_lock);
158 list_for_each_entry(page, &pgd_list, lru) {
159 pgd_t *pgd;
160 p4d_t *p4d;
161 spinlock_t *pgt_lock;
162
163 pgd = (pgd_t *)page_address(page) + pgd_index(addr);
164 p4d = p4d_offset(pgd, addr);
165 /* the pgt_lock only for Xen */
166 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
167 spin_lock(pgt_lock);
168
169 if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
170 BUG_ON(p4d_page_vaddr(*p4d)
171 != p4d_page_vaddr(*p4d_ref));
172
173 if (p4d_none(*p4d))
174 set_p4d(p4d, *p4d_ref);
175
176 spin_unlock(pgt_lock);
177 }
178 spin_unlock(&pgd_lock);
179 }
180 }
181 #endif
182
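/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the expected caller pattern, mirroring what kernel_physical_mapping_init()
 * and vmemmap_populate() below do -- after installing new top-level entries
 * for a kernel virtual range, propagate them to every process PGD.  The
 * helper and its arguments are hypothetical.
 */
#if 0
static void example_after_new_kernel_mapping(unsigned long vaddr_start,
					     unsigned long vaddr_end)
{
	/* new PGD/P4D entries were just installed for [vaddr_start, vaddr_end) */
	sync_global_pgds(vaddr_start, vaddr_end - 1);
}
#endif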
183 /*
184 * NOTE: This function is marked __ref because it calls the __init function
185 * alloc_bootmem_pages(). It's safe to do that ONLY while after_bootmem == 0.
186 */
187 static __ref void *spp_getpage(void)
188 {
189 void *ptr;
190
191 if (after_bootmem)
192 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
193 else
194 ptr = alloc_bootmem_pages(PAGE_SIZE);
195
196 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
197 panic("set_pte_phys: cannot allocate page data %s\n",
198 after_bootmem ? "after bootmem" : "");
199 }
200
201 pr_debug("spp_getpage %p\n", ptr);
202
203 return ptr;
204 }
205
206 static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
207 {
208 if (pgd_none(*pgd)) {
209 p4d_t *p4d = (p4d_t *)spp_getpage();
210 pgd_populate(&init_mm, pgd, p4d);
211 if (p4d != p4d_offset(pgd, 0))
212 printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
213 p4d, p4d_offset(pgd, 0));
214 }
215 return p4d_offset(pgd, vaddr);
216 }
217
218 static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
219 {
220 if (p4d_none(*p4d)) {
221 pud_t *pud = (pud_t *)spp_getpage();
222 p4d_populate(&init_mm, p4d, pud);
223 if (pud != pud_offset(p4d, 0))
224 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
225 pud, pud_offset(p4d, 0));
226 }
227 return pud_offset(p4d, vaddr);
228 }
229
230 static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
231 {
232 if (pud_none(*pud)) {
233 pmd_t *pmd = (pmd_t *) spp_getpage();
234 pud_populate(&init_mm, pud, pmd);
235 if (pmd != pmd_offset(pud, 0))
236 printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
237 pmd, pmd_offset(pud, 0));
238 }
239 return pmd_offset(pud, vaddr);
240 }
241
242 static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
243 {
244 if (pmd_none(*pmd)) {
245 pte_t *pte = (pte_t *) spp_getpage();
246 pmd_populate_kernel(&init_mm, pmd, pte);
247 if (pte != pte_offset_kernel(pmd, 0))
248 printk(KERN_ERR "PAGETABLE BUG #03!\n");
249 }
250 return pte_offset_kernel(pmd, vaddr);
251 }
252
253 static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
254 {
255 pmd_t *pmd = fill_pmd(pud, vaddr);
256 pte_t *pte = fill_pte(pmd, vaddr);
257
258 set_pte(pte, new_pte);
259
260 /*
261 * It's enough to flush this one mapping.
262 * (PGE mappings get flushed as well)
263 */
264 __flush_tlb_one_kernel(vaddr);
265 }
266
267 void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
268 {
269 p4d_t *p4d = p4d_page + p4d_index(vaddr);
270 pud_t *pud = fill_pud(p4d, vaddr);
271
272 __set_pte_vaddr(pud, vaddr, new_pte);
273 }
274
275 void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
276 {
277 pud_t *pud = pud_page + pud_index(vaddr);
278
279 __set_pte_vaddr(pud, vaddr, new_pte);
280 }
281
282 void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
283 {
284 pgd_t *pgd;
285 p4d_t *p4d_page;
286
287 pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
288
289 pgd = pgd_offset_k(vaddr);
290 if (pgd_none(*pgd)) {
291 printk(KERN_ERR
292 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
293 return;
294 }
295
296 p4d_page = p4d_offset(pgd, 0);
297 set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
298 }
299
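/*
 * Illustrative sketch (editorial addition): roughly how the fixmap code
 * consumes set_pte_vaddr() -- compute the fixed virtual slot and install a
 * single 4k mapping for it.  Simplified from the native fixmap helper;
 * treat it as a usage example, not as the actual definition.
 */
#if 0
static void example_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
			       pgprot_t flags)
{
	set_pte_vaddr(fix_to_virt(idx), pfn_pte(phys >> PAGE_SHIFT, flags));
}
#endif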
300 pmd_t * __init populate_extra_pmd(unsigned long vaddr)
301 {
302 pgd_t *pgd;
303 p4d_t *p4d;
304 pud_t *pud;
305
306 pgd = pgd_offset_k(vaddr);
307 p4d = fill_p4d(pgd, vaddr);
308 pud = fill_pud(p4d, vaddr);
309 return fill_pmd(pud, vaddr);
310 }
311
312 pte_t * __init populate_extra_pte(unsigned long vaddr)
313 {
314 pmd_t *pmd;
315
316 pmd = populate_extra_pmd(vaddr);
317 return fill_pte(pmd, vaddr);
318 }
319
320 /*
321 * Create large page table mappings for a range of physical addresses.
322 */
323 static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
324 enum page_cache_mode cache)
325 {
326 pgd_t *pgd;
327 p4d_t *p4d;
328 pud_t *pud;
329 pmd_t *pmd;
330 pgprot_t prot;
331
332 pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
333 pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
334 BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
335 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
336 pgd = pgd_offset_k((unsigned long)__va(phys));
337 if (pgd_none(*pgd)) {
338 p4d = (p4d_t *) spp_getpage();
339 set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
340 _PAGE_USER));
341 }
342 p4d = p4d_offset(pgd, (unsigned long)__va(phys));
343 if (p4d_none(*p4d)) {
344 pud = (pud_t *) spp_getpage();
345 set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
346 _PAGE_USER));
347 }
348 pud = pud_offset(p4d, (unsigned long)__va(phys));
349 if (pud_none(*pud)) {
350 pmd = (pmd_t *) spp_getpage();
351 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
352 _PAGE_USER));
353 }
354 pmd = pmd_offset(pud, phys);
355 BUG_ON(!pmd_none(*pmd));
356 set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
357 }
358 }
359
360 void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
361 {
362 __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
363 }
364
365 void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
366 {
367 __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
368 }
369
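/*
 * Illustrative sketch (editorial addition): callers pass in a PMD-aligned
 * physical range and get a 2M-page kernel mapping, either write-back or
 * uncached.  The UV platform code is the typical in-tree user for its MMR
 * space; the base and size below are hypothetical placeholders.
 */
#if 0
#define EXAMPLE_MMR_BASE	0xf8000000UL	/* hypothetical, 2M aligned */
#define EXAMPLE_MMR_SIZE	(2UL << 20)	/* one 2M page */

static void __init example_map_mmrs(void)
{
	init_extra_mapping_uc(EXAMPLE_MMR_BASE, EXAMPLE_MMR_SIZE);
}
#endif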
370 /*
371 * The head.S code sets up the kernel high mapping:
372 *
373 * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
374 *
375 * phys_base holds the negative offset to the kernel, which is added
376 * to the compile time generated pmds. This results in invalid pmds up
377 * to the point where we hit the physaddr 0 mapping.
378 *
379 * We limit the mappings to the region from _text to _brk_end. _brk_end
380 * is rounded up to the 2MB boundary. This catches the invalid pmds as
381 * well, as they are located before _text:
382 */
383 void __init cleanup_highmap(void)
384 {
385 unsigned long vaddr = __START_KERNEL_map;
386 unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
387 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
388 pmd_t *pmd = level2_kernel_pgt;
389
390 /*
391 * Native path, max_pfn_mapped is not set yet.
392 * Xen has valid max_pfn_mapped set in
393 * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
394 */
395 if (max_pfn_mapped)
396 vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
397
398 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
399 if (pmd_none(*pmd))
400 continue;
401 if (vaddr < (unsigned long) _text || vaddr > end)
402 set_pmd(pmd, __pmd(0));
403 }
404 }
405
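/*
 * Editorial note (worked example with hypothetical numbers): if _text sits
 * at __START_KERNEL_map + 0x1000000 and _brk_end lands at
 * __START_KERNEL_map + 0x1c54000, then end = roundup(_brk_end, 2M) - 1 =
 * __START_KERNEL_map + 0x1dfffff, so every 2M pmd slot of the high mapping
 * that lies below _text or above that boundary is cleared to zero.
 */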
406 /*
407 * Create PTE level page table mapping for physical addresses.
408 * It returns the last physical address mapped.
409 */
410 static unsigned long __meminit
411 phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
412 pgprot_t prot)
413 {
414 unsigned long pages = 0, paddr_next;
415 unsigned long paddr_last = paddr_end;
416 pte_t *pte;
417 int i;
418
419 pte = pte_page + pte_index(paddr);
420 i = pte_index(paddr);
421
422 for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) {
423 paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE;
424 if (paddr >= paddr_end) {
425 if (!after_bootmem &&
426 !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
427 E820_TYPE_RAM) &&
428 !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
429 E820_TYPE_RESERVED_KERN))
430 set_pte(pte, __pte(0));
431 continue;
432 }
433
434 /*
435 * We will reuse the existing mapping.
436 * Xen, for example, has some special requirements, like mapping
437 * pagetable pages as RO. So assume that whoever pre-set up
438 * these mappings knew what they were doing.
439 */
440 if (!pte_none(*pte)) {
441 if (!after_bootmem)
442 pages++;
443 continue;
444 }
445
446 if (0)
447 pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr,
448 pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
449 pages++;
450 set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
451 paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
452 }
453
454 update_page_count(PG_LEVEL_4K, pages);
455
456 return paddr_last;
457 }
458
459 /*
460 * Create PMD level page table mapping for physical addresses. The virtual
461 * and physical address have to be aligned at this level.
462 * It returns the last physical address mapped.
463 */
464 static unsigned long __meminit
465 phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
466 unsigned long page_size_mask, pgprot_t prot)
467 {
468 unsigned long pages = 0, paddr_next;
469 unsigned long paddr_last = paddr_end;
470
471 int i = pmd_index(paddr);
472
473 for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) {
474 pmd_t *pmd = pmd_page + pmd_index(paddr);
475 pte_t *pte;
476 pgprot_t new_prot = prot;
477
478 paddr_next = (paddr & PMD_MASK) + PMD_SIZE;
479 if (paddr >= paddr_end) {
480 if (!after_bootmem &&
481 !e820__mapped_any(paddr & PMD_MASK, paddr_next,
482 E820_TYPE_RAM) &&
483 !e820__mapped_any(paddr & PMD_MASK, paddr_next,
484 E820_TYPE_RESERVED_KERN))
485 set_pmd(pmd, __pmd(0));
486 continue;
487 }
488
489 if (!pmd_none(*pmd)) {
490 if (!pmd_large(*pmd)) {
491 spin_lock(&init_mm.page_table_lock);
492 pte = (pte_t *)pmd_page_vaddr(*pmd);
493 paddr_last = phys_pte_init(pte, paddr,
494 paddr_end, prot);
495 spin_unlock(&init_mm.page_table_lock);
496 continue;
497 }
498 /*
499 * If we are ok with PG_LEVEL_2M mapping, then we will
500 * use the existing mapping.
501 *
502 * Otherwise, we will split the large page mapping but
503 * use the same existing protection bits except for
504 * large page, so that we don't violate Intel's TLB
505 * Application note (317080) which says, while changing
506 * the page sizes, new and old translations should
507 * not differ with respect to page frame and
508 * attributes.
509 */
510 if (page_size_mask & (1 << PG_LEVEL_2M)) {
511 if (!after_bootmem)
512 pages++;
513 paddr_last = paddr_next;
514 continue;
515 }
516 new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
517 }
518
519 if (page_size_mask & (1<<PG_LEVEL_2M)) {
520 pages++;
521 spin_lock(&init_mm.page_table_lock);
522 set_pte((pte_t *)pmd,
523 pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
524 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
525 spin_unlock(&init_mm.page_table_lock);
526 paddr_last = paddr_next;
527 continue;
528 }
529
530 pte = alloc_low_page();
531 paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);
532
533 spin_lock(&init_mm.page_table_lock);
534 pmd_populate_kernel(&init_mm, pmd, pte);
535 spin_unlock(&init_mm.page_table_lock);
536 }
537 update_page_count(PG_LEVEL_2M, pages);
538 return paddr_last;
539 }
540
541 /*
542 * Create PUD level page table mapping for physical addresses. The virtual
543 * and physical address do not have to be aligned at this level. KASLR can
544 * randomize virtual addresses up to this level.
545 * It returns the last physical address mapped.
546 */
547 static unsigned long __meminit
548 phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
549 unsigned long page_size_mask)
550 {
551 unsigned long pages = 0, paddr_next;
552 unsigned long paddr_last = paddr_end;
553 unsigned long vaddr = (unsigned long)__va(paddr);
554 int i = pud_index(vaddr);
555
556 for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
557 pud_t *pud;
558 pmd_t *pmd;
559 pgprot_t prot = PAGE_KERNEL;
560
561 vaddr = (unsigned long)__va(paddr);
562 pud = pud_page + pud_index(vaddr);
563 paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
564
565 if (paddr >= paddr_end) {
566 if (!after_bootmem &&
567 !e820__mapped_any(paddr & PUD_MASK, paddr_next,
568 E820_TYPE_RAM) &&
569 !e820__mapped_any(paddr & PUD_MASK, paddr_next,
570 E820_TYPE_RESERVED_KERN))
571 set_pud(pud, __pud(0));
572 continue;
573 }
574
575 if (!pud_none(*pud)) {
576 if (!pud_large(*pud)) {
577 pmd = pmd_offset(pud, 0);
578 paddr_last = phys_pmd_init(pmd, paddr,
579 paddr_end,
580 page_size_mask,
581 prot);
582 __flush_tlb_all();
583 continue;
584 }
585 /*
586 * If we are ok with PG_LEVEL_1G mapping, then we will
587 * use the existing mapping.
588 *
589 * Otherwise, we will split the gbpage mapping but use
590 * the same existing protection bits except for large
591 * page, so that we don't violate Intel's TLB
592 * Application note (317080) which says, while changing
593 * the page sizes, new and old translations should
594 * not differ with respect to page frame and
595 * attributes.
596 */
597 if (page_size_mask & (1 << PG_LEVEL_1G)) {
598 if (!after_bootmem)
599 pages++;
600 paddr_last = paddr_next;
601 continue;
602 }
603 prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
604 }
605
606 if (page_size_mask & (1<<PG_LEVEL_1G)) {
607 pages++;
608 spin_lock(&init_mm.page_table_lock);
609 set_pte((pte_t *)pud,
610 pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
611 PAGE_KERNEL_LARGE));
612 spin_unlock(&init_mm.page_table_lock);
613 paddr_last = paddr_next;
614 continue;
615 }
616
617 pmd = alloc_low_page();
618 paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
619 page_size_mask, prot);
620
621 spin_lock(&init_mm.page_table_lock);
622 pud_populate(&init_mm, pud, pmd);
623 spin_unlock(&init_mm.page_table_lock);
624 }
625 __flush_tlb_all();
626
627 update_page_count(PG_LEVEL_1G, pages);
628
629 return paddr_last;
630 }
631
632 static unsigned long __meminit
633 phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
634 unsigned long page_size_mask)
635 {
636 unsigned long paddr_next, paddr_last = paddr_end;
637 unsigned long vaddr = (unsigned long)__va(paddr);
638 int i = p4d_index(vaddr);
639
640 if (!IS_ENABLED(CONFIG_X86_5LEVEL))
641 return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
642
643 for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
644 p4d_t *p4d;
645 pud_t *pud;
646
647 vaddr = (unsigned long)__va(paddr);
648 p4d = p4d_page + p4d_index(vaddr);
649 paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
650
651 if (paddr >= paddr_end) {
652 if (!after_bootmem &&
653 !e820__mapped_any(paddr & P4D_MASK, paddr_next,
654 E820_TYPE_RAM) &&
655 !e820__mapped_any(paddr & P4D_MASK, paddr_next,
656 E820_TYPE_RESERVED_KERN))
657 set_p4d(p4d, __p4d(0));
658 continue;
659 }
660
661 if (!p4d_none(*p4d)) {
662 pud = pud_offset(p4d, 0);
663 paddr_last = phys_pud_init(pud, paddr,
664 paddr_end,
665 page_size_mask);
666 __flush_tlb_all();
667 continue;
668 }
669
670 pud = alloc_low_page();
671 paddr_last = phys_pud_init(pud, paddr, paddr_end,
672 page_size_mask);
673
674 spin_lock(&init_mm.page_table_lock);
675 p4d_populate(&init_mm, p4d, pud);
676 spin_unlock(&init_mm.page_table_lock);
677 }
678 __flush_tlb_all();
679
680 return paddr_last;
681 }
682
683 /*
684 * Create page table mappings for the physical memory at specific physical
685 * address ranges. The virtual and physical addresses have to be aligned down
686 * to PMD level. It returns the last physical address mapped.
687 */
688 unsigned long __meminit
689 kernel_physical_mapping_init(unsigned long paddr_start,
690 unsigned long paddr_end,
691 unsigned long page_size_mask)
692 {
693 bool pgd_changed = false;
694 unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
695
696 paddr_last = paddr_end;
697 vaddr = (unsigned long)__va(paddr_start);
698 vaddr_end = (unsigned long)__va(paddr_end);
699 vaddr_start = vaddr;
700
701 for (; vaddr < vaddr_end; vaddr = vaddr_next) {
702 pgd_t *pgd = pgd_offset_k(vaddr);
703 p4d_t *p4d;
704
705 vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
706
707 if (pgd_val(*pgd)) {
708 p4d = (p4d_t *)pgd_page_vaddr(*pgd);
709 paddr_last = phys_p4d_init(p4d, __pa(vaddr),
710 __pa(vaddr_end),
711 page_size_mask);
712 continue;
713 }
714
715 p4d = alloc_low_page();
716 paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
717 page_size_mask);
718
719 spin_lock(&init_mm.page_table_lock);
720 if (IS_ENABLED(CONFIG_X86_5LEVEL))
721 pgd_populate(&init_mm, pgd, p4d);
722 else
723 p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
724 spin_unlock(&init_mm.page_table_lock);
725 pgd_changed = true;
726 }
727
728 if (pgd_changed)
729 sync_global_pgds(vaddr_start, vaddr_end - 1);
730
731 __flush_tlb_all();
732
733 return paddr_last;
734 }
735
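/*
 * Illustrative sketch (editorial addition): the usual entry point is
 * init_memory_mapping() in arch/x86/mm/init.c, which splits [start, end)
 * into ranges by supported page size and calls the function above once per
 * range.  Heavily simplified; the helper and mask construction below are
 * hypothetical.
 */
#if 0
static unsigned long __init example_map_range(u64 start, u64 end)
{
	unsigned long page_size_mask = 0;

	if (boot_cpu_has(X86_FEATURE_PSE))
		page_size_mask |= 1 << PG_LEVEL_2M;
	if (boot_cpu_has(X86_FEATURE_GBPAGES))
		page_size_mask |= 1 << PG_LEVEL_1G;

	return kernel_physical_mapping_init(start, end, page_size_mask);
}
#endif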
736 #ifndef CONFIG_NUMA
737 void __init initmem_init(void)
738 {
739 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
740 }
741 #endif
742
743 void __init paging_init(void)
744 {
745 sparse_memory_present_with_active_regions(MAX_NUMNODES);
746 sparse_init();
747
748 /*
749 * Clear the default state for node 0.
750 * Note: don't use nodes_clear() here; when NUMA support is not
751 * compiled in it really clears the state, and a later
752 * node_set_state() will not set it back.
753 */
754 node_clear_state(0, N_MEMORY);
755 if (N_MEMORY != N_NORMAL_MEMORY)
756 node_clear_state(0, N_NORMAL_MEMORY);
757
758 zone_sizes_init();
759 }
760
761 /*
762 * Memory hotplug specific functions
763 */
764 #ifdef CONFIG_MEMORY_HOTPLUG
765 /*
766 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
767 * updating.
768 */
769 static void update_end_of_memory_vars(u64 start, u64 size)
770 {
771 unsigned long end_pfn = PFN_UP(start + size);
772
773 if (end_pfn > max_pfn) {
774 max_pfn = end_pfn;
775 max_low_pfn = end_pfn;
776 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
777 }
778 }
779
780 int add_pages(int nid, unsigned long start_pfn,
781 unsigned long nr_pages, bool want_memblock)
782 {
783 int ret;
784
785 ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
786 WARN_ON_ONCE(ret);
787
788 /* update max_pfn, max_low_pfn and high_memory */
789 update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
790 nr_pages << PAGE_SHIFT);
791
792 return ret;
793 }
794
795 int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
796 {
797 unsigned long start_pfn = start >> PAGE_SHIFT;
798 unsigned long nr_pages = size >> PAGE_SHIFT;
799
800 init_memory_mapping(start, start + size);
801
802 return add_pages(nid, start_pfn, nr_pages, want_memblock);
803 }
804 EXPORT_SYMBOL_GPL(arch_add_memory);
805
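/*
 * Illustrative sketch (editorial addition): memory hotplug ends up in
 * arch_add_memory() via add_memory().  A hypothetical driver-side call
 * adding one 128M block at the 4G boundary on node 0 would look like this
 * (values are made up for illustration).
 */
#if 0
static int example_hotplug_one_block(void)
{
	return add_memory(0, 0x100000000ULL, 128UL << 20);
}
#endif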
806 #define PAGE_INUSE 0xFD
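/*
 * Editorial note: PAGE_INUSE is a poison byte for partially freed vmemmap
 * ranges.  The remove_*_table() helpers below memset() the freed sub-range
 * to 0xFD and only release the backing page once memchr_inv() reports the
 * whole page as 0xFD, i.e. once nothing in it is still live.
 */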
807
808 static void __meminit free_pagetable(struct page *page, int order)
809 {
810 unsigned long magic;
811 unsigned int nr_pages = 1 << order;
812 struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page);
813
814 if (altmap) {
815 vmem_altmap_free(altmap, nr_pages);
816 return;
817 }
818
819 /* bootmem page has reserved flag */
820 if (PageReserved(page)) {
821 __ClearPageReserved(page);
822
823 magic = (unsigned long)page->freelist;
824 if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
825 while (nr_pages--)
826 put_page_bootmem(page++);
827 } else
828 while (nr_pages--)
829 free_reserved_page(page++);
830 } else
831 free_pages((unsigned long)page_address(page), order);
832 }
833
834 static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
835 {
836 pte_t *pte;
837 int i;
838
839 for (i = 0; i < PTRS_PER_PTE; i++) {
840 pte = pte_start + i;
841 if (!pte_none(*pte))
842 return;
843 }
844
845 /* free a pte table */
846 free_pagetable(pmd_page(*pmd), 0);
847 spin_lock(&init_mm.page_table_lock);
848 pmd_clear(pmd);
849 spin_unlock(&init_mm.page_table_lock);
850 }
851
852 static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
853 {
854 pmd_t *pmd;
855 int i;
856
857 for (i = 0; i < PTRS_PER_PMD; i++) {
858 pmd = pmd_start + i;
859 if (!pmd_none(*pmd))
860 return;
861 }
862
863 /* free a pmd table */
864 free_pagetable(pud_page(*pud), 0);
865 spin_lock(&init_mm.page_table_lock);
866 pud_clear(pud);
867 spin_unlock(&init_mm.page_table_lock);
868 }
869
870 static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
871 {
872 pud_t *pud;
873 int i;
874
875 for (i = 0; i < PTRS_PER_PUD; i++) {
876 pud = pud_start + i;
877 if (!pud_none(*pud))
878 return;
879 }
880
881 /* free a pud table */
882 free_pagetable(p4d_page(*p4d), 0);
883 spin_lock(&init_mm.page_table_lock);
884 p4d_clear(p4d);
885 spin_unlock(&init_mm.page_table_lock);
886 }
887
888 static void __meminit
889 remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
890 bool direct)
891 {
892 unsigned long next, pages = 0;
893 pte_t *pte;
894 void *page_addr;
895 phys_addr_t phys_addr;
896
897 pte = pte_start + pte_index(addr);
898 for (; addr < end; addr = next, pte++) {
899 next = (addr + PAGE_SIZE) & PAGE_MASK;
900 if (next > end)
901 next = end;
902
903 if (!pte_present(*pte))
904 continue;
905
906 /*
907 * We mapped the [0, 1G) range as an identity mapping when
908 * initializing, in arch/x86/kernel/head_64.S. These
909 * pagetables cannot be removed.
910 */
911 phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
912 if (phys_addr < (phys_addr_t)0x40000000)
913 return;
914
915 if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
916 /*
917 * Do not free direct mapping pages since they were
918 * freed when offlining, or are simply not in use.
919 */
920 if (!direct)
921 free_pagetable(pte_page(*pte), 0);
922
923 spin_lock(&init_mm.page_table_lock);
924 pte_clear(&init_mm, addr, pte);
925 spin_unlock(&init_mm.page_table_lock);
926
927 /* For non-direct mappings, the pages counter means nothing. */
928 pages++;
929 } else {
930 /*
931 * If we are here, we are freeing vmemmap pages since
932 * direct mapped memory ranges to be freed are aligned.
933 *
934 * If we are not removing the whole page, it means
935 * other page structs in this page are being used and
936 * we cannot remove them. So fill the unused page structs
937 * with 0xFD, and remove the page when it is wholly
938 * filled with 0xFD.
939 */
940 memset((void *)addr, PAGE_INUSE, next - addr);
941
942 page_addr = page_address(pte_page(*pte));
943 if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
944 free_pagetable(pte_page(*pte), 0);
945
946 spin_lock(&init_mm.page_table_lock);
947 pte_clear(&init_mm, addr, pte);
948 spin_unlock(&init_mm.page_table_lock);
949 }
950 }
951 }
952
953 /* The pte table itself is freed by free_pte_table() in remove_pmd_table(). */
954 flush_tlb_all();
955 if (direct)
956 update_page_count(PG_LEVEL_4K, -pages);
957 }
958
959 static void __meminit
960 remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
961 bool direct)
962 {
963 unsigned long next, pages = 0;
964 pte_t *pte_base;
965 pmd_t *pmd;
966 void *page_addr;
967
968 pmd = pmd_start + pmd_index(addr);
969 for (; addr < end; addr = next, pmd++) {
970 next = pmd_addr_end(addr, end);
971
972 if (!pmd_present(*pmd))
973 continue;
974
975 if (pmd_large(*pmd)) {
976 if (IS_ALIGNED(addr, PMD_SIZE) &&
977 IS_ALIGNED(next, PMD_SIZE)) {
978 if (!direct)
979 free_pagetable(pmd_page(*pmd),
980 get_order(PMD_SIZE));
981
982 spin_lock(&init_mm.page_table_lock);
983 pmd_clear(pmd);
984 spin_unlock(&init_mm.page_table_lock);
985 pages++;
986 } else {
987 /* If here, we are freeing vmemmap pages. */
988 memset((void *)addr, PAGE_INUSE, next - addr);
989
990 page_addr = page_address(pmd_page(*pmd));
991 if (!memchr_inv(page_addr, PAGE_INUSE,
992 PMD_SIZE)) {
993 free_pagetable(pmd_page(*pmd),
994 get_order(PMD_SIZE));
995
996 spin_lock(&init_mm.page_table_lock);
997 pmd_clear(pmd);
998 spin_unlock(&init_mm.page_table_lock);
999 }
1000 }
1001
1002 continue;
1003 }
1004
1005 pte_base = (pte_t *)pmd_page_vaddr(*pmd);
1006 remove_pte_table(pte_base, addr, next, direct);
1007 free_pte_table(pte_base, pmd);
1008 }
1009
1010 /* The pmd table itself is freed by free_pmd_table() in remove_pud_table(). */
1011 if (direct)
1012 update_page_count(PG_LEVEL_2M, -pages);
1013 }
1014
1015 static void __meminit
1016 remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
1017 bool direct)
1018 {
1019 unsigned long next, pages = 0;
1020 pmd_t *pmd_base;
1021 pud_t *pud;
1022 void *page_addr;
1023
1024 pud = pud_start + pud_index(addr);
1025 for (; addr < end; addr = next, pud++) {
1026 next = pud_addr_end(addr, end);
1027
1028 if (!pud_present(*pud))
1029 continue;
1030
1031 if (pud_large(*pud)) {
1032 if (IS_ALIGNED(addr, PUD_SIZE) &&
1033 IS_ALIGNED(next, PUD_SIZE)) {
1034 if (!direct)
1035 free_pagetable(pud_page(*pud),
1036 get_order(PUD_SIZE));
1037
1038 spin_lock(&init_mm.page_table_lock);
1039 pud_clear(pud);
1040 spin_unlock(&init_mm.page_table_lock);
1041 pages++;
1042 } else {
1043 /* If here, we are freeing vmemmap pages. */
1044 memset((void *)addr, PAGE_INUSE, next - addr);
1045
1046 page_addr = page_address(pud_page(*pud));
1047 if (!memchr_inv(page_addr, PAGE_INUSE,
1048 PUD_SIZE)) {
1049 free_pagetable(pud_page(*pud),
1050 get_order(PUD_SIZE));
1051
1052 spin_lock(&init_mm.page_table_lock);
1053 pud_clear(pud);
1054 spin_unlock(&init_mm.page_table_lock);
1055 }
1056 }
1057
1058 continue;
1059 }
1060
1061 pmd_base = pmd_offset(pud, 0);
1062 remove_pmd_table(pmd_base, addr, next, direct);
1063 free_pmd_table(pmd_base, pud);
1064 }
1065
1066 if (direct)
1067 update_page_count(PG_LEVEL_1G, -pages);
1068 }
1069
1070 static void __meminit
1071 remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
1072 bool direct)
1073 {
1074 unsigned long next, pages = 0;
1075 pud_t *pud_base;
1076 p4d_t *p4d;
1077
1078 p4d = p4d_start + p4d_index(addr);
1079 for (; addr < end; addr = next, p4d++) {
1080 next = p4d_addr_end(addr, end);
1081
1082 if (!p4d_present(*p4d))
1083 continue;
1084
1085 BUILD_BUG_ON(p4d_large(*p4d));
1086
1087 pud_base = pud_offset(p4d, 0);
1088 remove_pud_table(pud_base, addr, next, direct);
1089 /*
1090 * For 4-level page tables we do not want to free PUDs, but in the
1091 * 5-level case we should free them. This code will have to change
1092 * to adapt for boot-time switching between 4 and 5 level page tables.
1093 */
1094 if (CONFIG_PGTABLE_LEVELS == 5)
1095 free_pud_table(pud_base, p4d);
1096 }
1097
1098 if (direct)
1099 update_page_count(PG_LEVEL_512G, -pages);
1100 }
1101
1102 /* start and end are both virtual addresses. */
1103 static void __meminit
1104 remove_pagetable(unsigned long start, unsigned long end, bool direct)
1105 {
1106 unsigned long next;
1107 unsigned long addr;
1108 pgd_t *pgd;
1109 p4d_t *p4d;
1110
1111 for (addr = start; addr < end; addr = next) {
1112 next = pgd_addr_end(addr, end);
1113
1114 pgd = pgd_offset_k(addr);
1115 if (!pgd_present(*pgd))
1116 continue;
1117
1118 p4d = p4d_offset(pgd, 0);
1119 remove_p4d_table(p4d, addr, next, direct);
1120 }
1121
1122 flush_tlb_all();
1123 }
1124
1125 void __ref vmemmap_free(unsigned long start, unsigned long end)
1126 {
1127 remove_pagetable(start, end, false);
1128 }
1129
1130 #ifdef CONFIG_MEMORY_HOTREMOVE
1131 static void __meminit
1132 kernel_physical_mapping_remove(unsigned long start, unsigned long end)
1133 {
1134 start = (unsigned long)__va(start);
1135 end = (unsigned long)__va(end);
1136
1137 remove_pagetable(start, end, true);
1138 }
1139
1140 int __ref arch_remove_memory(u64 start, u64 size)
1141 {
1142 unsigned long start_pfn = start >> PAGE_SHIFT;
1143 unsigned long nr_pages = size >> PAGE_SHIFT;
1144 struct page *page = pfn_to_page(start_pfn);
1145 struct vmem_altmap *altmap;
1146 struct zone *zone;
1147 int ret;
1148
1149 /* With altmap the first mapped page is offset from @start */
1150 altmap = to_vmem_altmap((unsigned long) page);
1151 if (altmap)
1152 page += vmem_altmap_offset(altmap);
1153 zone = page_zone(page);
1154 ret = __remove_pages(zone, start_pfn, nr_pages);
1155 WARN_ON_ONCE(ret);
1156 kernel_physical_mapping_remove(start, start + size);
1157
1158 return ret;
1159 }
1160 #endif
1161 #endif /* CONFIG_MEMORY_HOTPLUG */
1162
1163 static struct kcore_list kcore_vsyscall;
1164
1165 static void __init register_page_bootmem_info(void)
1166 {
1167 #ifdef CONFIG_NUMA
1168 int i;
1169
1170 for_each_online_node(i)
1171 register_page_bootmem_info_node(NODE_DATA(i));
1172 #endif
1173 }
1174
1175 void __init mem_init(void)
1176 {
1177 pci_iommu_alloc();
1178
1179 /* clear_bss() has already cleared the empty_zero_page */
1180
1181 /* this will put all memory onto the freelists */
1182 free_all_bootmem();
1183 after_bootmem = 1;
1184
1185 /*
1186 * Must be done after boot memory is put on freelist, because here we
1187 * might set fields in deferred struct pages that have not yet been
1188 * initialized, and free_all_bootmem() initializes all the reserved
1189 * deferred pages for us.
1190 */
1191 register_page_bootmem_info();
1192
1193 /* Register memory areas for /proc/kcore */
1194 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
1195
1196 mem_init_print_info(NULL);
1197 }
1198
1199 int kernel_set_to_readonly;
1200
1201 void set_kernel_text_rw(void)
1202 {
1203 unsigned long start = PFN_ALIGN(_text);
1204 unsigned long end = PFN_ALIGN(__stop___ex_table);
1205
1206 if (!kernel_set_to_readonly)
1207 return;
1208
1209 pr_debug("Set kernel text: %lx - %lx for read write\n",
1210 start, end);
1211
1212 /*
1213 * Make the kernel identity mapping for text RW. Kernel text
1214 * mapping will always be RO. Refer to the comment in
1215 * static_protections() in pageattr.c
1216 */
1217 set_memory_rw(start, (end - start) >> PAGE_SHIFT);
1218 }
1219
1220 void set_kernel_text_ro(void)
1221 {
1222 unsigned long start = PFN_ALIGN(_text);
1223 unsigned long end = PFN_ALIGN(__stop___ex_table);
1224
1225 if (!kernel_set_to_readonly)
1226 return;
1227
1228 pr_debug("Set kernel text: %lx - %lx for read only\n",
1229 start, end);
1230
1231 /*
1232 * Set the kernel identity mapping for text RO.
1233 */
1234 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
1235 }
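/*
 * Editorial note: set_kernel_text_rw()/set_kernel_text_ro() are meant to
 * bracket kernel text patching.  In kernels of this vintage the ftrace
 * code-modification hooks are the typical callers, making the identity
 * mapping of the text range writable around a patch and read-only again
 * afterwards; the text mapping itself stays RO throughout (see the comment
 * above about static_protections()).
 */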
1236
1237 void mark_rodata_ro(void)
1238 {
1239 unsigned long start = PFN_ALIGN(_text);
1240 unsigned long rodata_start = PFN_ALIGN(__start_rodata);
1241 unsigned long end = (unsigned long) &__end_rodata_hpage_align;
1242 unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
1243 unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
1244 unsigned long all_end;
1245
1246 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
1247 (end - start) >> 10);
1248 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
1249
1250 kernel_set_to_readonly = 1;
1251
1252 /*
1253 * The rodata/data/bss/brk section (but not the kernel text!)
1254 * should also be not-executable.
1255 *
1256 * We align all_end to PMD_SIZE because the existing mapping
1257 * is a full PMD. If we aligned _brk_end to PAGE_SIZE instead, we
1258 * would split the PMD, and the remainder between _brk_end and the
1259 * end of the PMD would remain mapped executable.
1260 *
1261 * Any PMD which was set up after the one which covers _brk_end
1262 * has been zapped already via cleanup_highmap().
1263 */
1264 all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
1265 set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
1266
1267 #ifdef CONFIG_CPA_DEBUG
1268 printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
1269 set_memory_rw(start, (end-start) >> PAGE_SHIFT);
1270
1271 printk(KERN_INFO "Testing CPA: again\n");
1272 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
1273 #endif
1274
1275 free_init_pages("unused kernel",
1276 (unsigned long) __va(__pa_symbol(text_end)),
1277 (unsigned long) __va(__pa_symbol(rodata_start)));
1278 free_init_pages("unused kernel",
1279 (unsigned long) __va(__pa_symbol(rodata_end)),
1280 (unsigned long) __va(__pa_symbol(_sdata)));
1281
1282 debug_checkwx();
1283 }
1284
1285 int kern_addr_valid(unsigned long addr)
1286 {
1287 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
1288 pgd_t *pgd;
1289 p4d_t *p4d;
1290 pud_t *pud;
1291 pmd_t *pmd;
1292 pte_t *pte;
1293
1294 if (above != 0 && above != -1UL)
1295 return 0;
1296
1297 pgd = pgd_offset_k(addr);
1298 if (pgd_none(*pgd))
1299 return 0;
1300
1301 p4d = p4d_offset(pgd, addr);
1302 if (p4d_none(*p4d))
1303 return 0;
1304
1305 pud = pud_offset(p4d, addr);
1306 if (pud_none(*pud))
1307 return 0;
1308
1309 if (pud_large(*pud))
1310 return pfn_valid(pud_pfn(*pud));
1311
1312 pmd = pmd_offset(pud, addr);
1313 if (pmd_none(*pmd))
1314 return 0;
1315
1316 if (pmd_large(*pmd))
1317 return pfn_valid(pmd_pfn(*pmd));
1318
1319 pte = pte_offset_kernel(pmd, addr);
1320 if (pte_none(*pte))
1321 return 0;
1322
1323 return pfn_valid(pte_pfn(*pte));
1324 }
1325
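/*
 * Editorial note (worked example, assuming the 4-level value
 * __VIRTUAL_MASK_SHIFT == 47): the "above" test rejects non-canonical
 * addresses.  An arithmetic shift right by 47 must yield 0 (user half) or
 * -1 (sign-extended kernel half); e.g. 0xffff888000000000 >> 47 == -1 and
 * passes, while 0x0000900000000000 >> 47 == 1 and is rejected.
 */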
1326 static unsigned long probe_memory_block_size(void)
1327 {
1328 unsigned long bz = MIN_MEMORY_BLOCK_SIZE;
1329
1330 /* if system is UV or has 64GB of RAM or more, use large blocks */
1331 if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30)))
1332 bz = 2UL << 30; /* 2GB */
1333
1334 pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
1335
1336 return bz;
1337 }
1338
1339 static unsigned long memory_block_size_probed;
1340 unsigned long memory_block_size_bytes(void)
1341 {
1342 if (!memory_block_size_probed)
1343 memory_block_size_probed = probe_memory_block_size();
1344
1345 return memory_block_size_probed;
1346 }
1347
1348 #ifdef CONFIG_SPARSEMEM_VMEMMAP
1349 /*
1350 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
1351 */
1352 static long __meminitdata addr_start, addr_end;
1353 static void __meminitdata *p_start, *p_end;
1354 static int __meminitdata node_start;
1355
1356 static int __meminit vmemmap_populate_hugepages(unsigned long start,
1357 unsigned long end, int node, struct vmem_altmap *altmap)
1358 {
1359 unsigned long addr;
1360 unsigned long next;
1361 pgd_t *pgd;
1362 p4d_t *p4d;
1363 pud_t *pud;
1364 pmd_t *pmd;
1365
1366 for (addr = start; addr < end; addr = next) {
1367 next = pmd_addr_end(addr, end);
1368
1369 pgd = vmemmap_pgd_populate(addr, node);
1370 if (!pgd)
1371 return -ENOMEM;
1372
1373 p4d = vmemmap_p4d_populate(pgd, addr, node);
1374 if (!p4d)
1375 return -ENOMEM;
1376
1377 pud = vmemmap_pud_populate(p4d, addr, node);
1378 if (!pud)
1379 return -ENOMEM;
1380
1381 pmd = pmd_offset(pud, addr);
1382 if (pmd_none(*pmd)) {
1383 void *p;
1384
1385 p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
1386 if (p) {
1387 pte_t entry;
1388
1389 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
1390 PAGE_KERNEL_LARGE);
1391 set_pmd(pmd, __pmd(pte_val(entry)));
1392
1393 /* check to see if we have contiguous blocks */
1394 if (p_end != p || node_start != node) {
1395 if (p_start)
1396 pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1397 addr_start, addr_end-1, p_start, p_end-1, node_start);
1398 addr_start = addr;
1399 node_start = node;
1400 p_start = p;
1401 }
1402
1403 addr_end = addr + PMD_SIZE;
1404 p_end = p + PMD_SIZE;
1405 continue;
1406 } else if (altmap)
1407 return -ENOMEM; /* no fallback */
1408 } else if (pmd_large(*pmd)) {
1409 vmemmap_verify((pte_t *)pmd, node, addr, next);
1410 continue;
1411 }
1412 if (vmemmap_populate_basepages(addr, next, node))
1413 return -ENOMEM;
1414 }
1415 return 0;
1416 }
1417
1418 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
1419 {
1420 struct vmem_altmap *altmap = to_vmem_altmap(start);
1421 int err;
1422
1423 if (boot_cpu_has(X86_FEATURE_PSE))
1424 err = vmemmap_populate_hugepages(start, end, node, altmap);
1425 else if (altmap) {
1426 pr_err_once("%s: no cpu support for altmap allocations\n",
1427 __func__);
1428 err = -ENOMEM;
1429 } else
1430 err = vmemmap_populate_basepages(start, end, node);
1431 if (!err)
1432 sync_global_pgds(start, end - 1);
1433 return err;
1434 }
1435
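/*
 * Editorial note (worked example, assuming sizeof(struct page) == 64):
 * one 2M vmemmap page holds 2M / 64 = 32768 struct pages, which in turn
 * describe 32768 * 4K = 128M of memory, so the vmemmap costs roughly
 * 64/4096 ~= 1.6% of RAM and each PMD-level vmemmap mapping describes
 * 128M worth of memory.
 */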
1436 #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
1437 void register_page_bootmem_memmap(unsigned long section_nr,
1438 struct page *start_page, unsigned long nr_pages)
1439 {
1440 unsigned long addr = (unsigned long)start_page;
1441 unsigned long end = (unsigned long)(start_page + nr_pages);
1442 unsigned long next;
1443 pgd_t *pgd;
1444 p4d_t *p4d;
1445 pud_t *pud;
1446 pmd_t *pmd;
1447 unsigned int nr_pmd_pages;
1448 struct page *page;
1449
1450 for (; addr < end; addr = next) {
1451 pte_t *pte = NULL;
1452
1453 pgd = pgd_offset_k(addr);
1454 if (pgd_none(*pgd)) {
1455 next = (addr + PAGE_SIZE) & PAGE_MASK;
1456 continue;
1457 }
1458 get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
1459
1460 p4d = p4d_offset(pgd, addr);
1461 if (p4d_none(*p4d)) {
1462 next = (addr + PAGE_SIZE) & PAGE_MASK;
1463 continue;
1464 }
1465 get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);
1466
1467 pud = pud_offset(p4d, addr);
1468 if (pud_none(*pud)) {
1469 next = (addr + PAGE_SIZE) & PAGE_MASK;
1470 continue;
1471 }
1472 get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
1473
1474 if (!boot_cpu_has(X86_FEATURE_PSE)) {
1475 next = (addr + PAGE_SIZE) & PAGE_MASK;
1476 pmd = pmd_offset(pud, addr);
1477 if (pmd_none(*pmd))
1478 continue;
1479 get_page_bootmem(section_nr, pmd_page(*pmd),
1480 MIX_SECTION_INFO);
1481
1482 pte = pte_offset_kernel(pmd, addr);
1483 if (pte_none(*pte))
1484 continue;
1485 get_page_bootmem(section_nr, pte_page(*pte),
1486 SECTION_INFO);
1487 } else {
1488 next = pmd_addr_end(addr, end);
1489
1490 pmd = pmd_offset(pud, addr);
1491 if (pmd_none(*pmd))
1492 continue;
1493
1494 nr_pmd_pages = 1 << get_order(PMD_SIZE);
1495 page = pmd_page(*pmd);
1496 while (nr_pmd_pages--)
1497 get_page_bootmem(section_nr, page++,
1498 SECTION_INFO);
1499 }
1500 }
1501 }
1502 #endif
1503
1504 void __meminit vmemmap_populate_print_last(void)
1505 {
1506 if (p_start) {
1507 pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1508 addr_start, addr_end-1, p_start, p_end-1, node_start);
1509 p_start = NULL;
1510 p_end = NULL;
1511 node_start = 0;
1512 }
1513 }
1514 #endif