]>
Commit | Line | Data |
---|---|---|
1 | // SPDX-License-Identifier: GPL-2.0-only | |
2 | /* | |
3 | * Debug helper to dump the current kernel pagetables of the system | |
4 | * so that we can see what the various memory ranges are set to. | |
5 | * | |
6 | * (C) Copyright 2008 Intel Corporation | |
7 | * | |
8 | * Author: Arjan van de Ven <arjan@linux.intel.com> | |
9 | */ | |
10 | ||
11 | #include <linux/debugfs.h> | |
12 | #include <linux/kasan.h> | |
13 | #include <linux/mm.h> | |
14 | #include <linux/init.h> | |
15 | #include <linux/sched.h> | |
16 | #include <linux/seq_file.h> | |
17 | #include <linux/highmem.h> | |
18 | #include <linux/pci.h> | |
19 | #include <linux/ptdump.h> | |
20 | ||
21 | #include <asm/e820/types.h> | |
22 | ||
/*
 * The dumper groups pagetable entries of the same type into one, and for
 * that it needs to keep some state when walking, and flush this state
 * when a "break" in the continuity is found.
 */
struct pg_state {
	struct ptdump_state ptdump;	/* generic walker callbacks; embedded so container_of() works */
	int level;			/* page-table level of the run being accumulated; -1 = no run yet */
	pgprotval_t current_prot;	/* raw protection bits of the current run */
	pgprotval_t effective_prot;	/* effective (parent-combined) protection of the current run */
	pgprotval_t prot_levels[5];	/* per-level effective protections, filled by effective_prot() */
	unsigned long start_address;	/* virtual address where the current run started */
	const struct addr_marker *marker;	/* current address-space marker; marker[1] is the next boundary */
	unsigned long lines;		/* lines printed within the current marker region */
	bool to_dmesg;			/* true: print via printk(); false: print to ->seq */
	bool check_wx;			/* accumulate W+X page statistics during the walk */
	unsigned long wx_pages;		/* number of W+X pages found so far */
	struct seq_file *seq;		/* output target when !to_dmesg; may be NULL (checkwx-only walk) */
};
42 | ||
/* A named boundary in the virtual address space, used to label dump sections. */
struct addr_marker {
	unsigned long start_address;	/* first address of the region; 0 entries are filled in pt_dump_init() */
	const char *name;		/* label printed as "---[ name ]---"; NULL terminates the table */
	unsigned long max_lines;	/* if non-zero, cap on printed lines for this region (rest is skipped) */
};
48 | ||
49 | /* Address space markers hints */ | |
50 | ||
51 | #ifdef CONFIG_X86_64 | |
52 | ||
/*
 * Indices into address_markers[] below; the declaration order here defines
 * the order in which the regions appear in the dump.
 */
enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	LDT_NR,
#endif
	LOW_KERNEL_NR,
	VMALLOC_START_NR,
	VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
	KASAN_SHADOW_START_NR,
	KASAN_SHADOW_END_NR,
#endif
	CPU_ENTRY_AREA_NR,
#ifdef CONFIG_X86_ESPFIX64
	ESPFIX_START_NR,
#endif
#ifdef CONFIG_EFI
	EFI_END_NR,
#endif
	HIGH_KERNEL_NR,
	MODULES_VADDR_NR,
	MODULES_END_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,
};
79 | ||
/*
 * 64-bit address-space markers.  Entries initialized to 0UL here are not
 * compile-time constants; pt_dump_init() fills them in at boot.
 */
static struct addr_marker address_markers[] = {
	[USER_SPACE_NR]		= { 0,			"User Space" },
	[KERNEL_SPACE_NR]	= { (1UL << 63),	"Kernel Space" },
	[LOW_KERNEL_NR]		= { 0UL,		"Low Kernel Mapping" },
	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
	[VMEMMAP_START_NR]	= { 0UL,		"Vmemmap" },
#ifdef CONFIG_KASAN
	/*
	 * These fields get initialized with the (dynamic)
	 * KASAN_SHADOW_{START,END} values in pt_dump_init().
	 */
	[KASAN_SHADOW_START_NR]	= { 0UL,		"KASAN shadow" },
	[KASAN_SHADOW_END_NR]	= { 0UL,		"KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	[LDT_NR]		= { 0UL,		"LDT remap" },
#endif
	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
	/* ESPfix area is huge and repetitive: cap the dump at 16 lines. */
	[ESPFIX_START_NR]	= { ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
#endif
#ifdef CONFIG_EFI
	[EFI_END_NR]		= { EFI_VA_END,		"EFI Runtime Services" },
#endif
	[HIGH_KERNEL_NR]	= { __START_KERNEL_map,	"High Kernel Mapping" },
	[MODULES_VADDR_NR]	= { MODULES_VADDR,	"Modules" },
	[MODULES_END_NR]	= { MODULES_END,	"End Modules" },
	[FIXADDR_START_NR]	= { FIXADDR_START,	"Fixmap Area" },
	[END_OF_SPACE_NR]	= { -1,			NULL }	/* sentinel */
};
110 | ||
/* Root of init_mm's page tables on 64-bit. */
#define INIT_PGD	((pgd_t *) &init_top_pgt)
112 | ||
113 | #else /* CONFIG_X86_64 */ | |
114 | ||
/* 32-bit counterpart of the marker indices; order defines dump order. */
enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
	VMALLOC_START_NR,
	VMALLOC_END_NR,
#ifdef CONFIG_HIGHMEM
	PKMAP_BASE_NR,
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	LDT_NR,
#endif
	CPU_ENTRY_AREA_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,
};
130 | ||
/*
 * 32-bit address-space markers.  All 0UL entries are dynamic and are
 * assigned in pt_dump_init().
 */
static struct addr_marker address_markers[] = {
	[USER_SPACE_NR]		= { 0,		"User Space" },
	[KERNEL_SPACE_NR]	= { PAGE_OFFSET, "Kernel Mapping" },
	[VMALLOC_START_NR]	= { 0UL,	"vmalloc() Area" },
	[VMALLOC_END_NR]	= { 0UL,	"vmalloc() End" },
#ifdef CONFIG_HIGHMEM
	[PKMAP_BASE_NR]		= { 0UL,	"Persistent kmap() Area" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	[LDT_NR]		= { 0UL,	"LDT remap" },
#endif
	[CPU_ENTRY_AREA_NR]	= { 0UL,	"CPU entry area" },
	[FIXADDR_START_NR]	= { 0UL,	"Fixmap area" },
	[END_OF_SPACE_NR]	= { -1,		NULL }	/* sentinel */
};
146 | ||
/* Root of init_mm's page tables on 32-bit. */
#define INIT_PGD	(swapper_pg_dir)
148 | ||
149 | #endif /* !CONFIG_X86_64 */ | |
150 | ||
/*
 * Multipliers for offsets within the PTEs: the amount of virtual address
 * space covered by one entry at each page-table level.
 */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
157 | ||
/*
 * Start a new output line: either to dmesg (KERN_INFO) or to the seq_file.
 * The seq_file may be NULL (W+X-check-only walk), in which case nothing is
 * printed.
 */
#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)						\
		printk(KERN_INFO fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})
166 | ||
/*
 * Continue the current output line (KERN_CONT for dmesg).  As above, a NULL
 * seq_file suppresses output.
 */
#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)						\
		printk(KERN_CONT fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})
175 | ||
/*
 * Print a readable form of a pgprot_t to the seq_file.
 *
 * @m:     seq_file to print to (ignored when @dmsg is true)
 * @pr:    protection bits of the entry/run being described
 * @level: page-table level (0=pgd .. 4=pte), used to name the level and to
 *         disambiguate the level-dependent bits (PSE/PAT share bit 7)
 * @dmsg:  print via printk() instead of the seq_file
 */
static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg)
{
	static const char * const level_name[] =
		{ "pgd", "p4d", "pud", "pmd", "pte" };

	if (!(pr & _PAGE_PRESENT)) {
		/* Not present */
		pt_dump_cont_printf(m, dmsg, " ");
	} else {
		/* User/supervisor access */
		if (pr & _PAGE_USER)
			pt_dump_cont_printf(m, dmsg, "USR ");
		else
			pt_dump_cont_printf(m, dmsg, " ");
		/* Writable vs read-only */
		if (pr & _PAGE_RW)
			pt_dump_cont_printf(m, dmsg, "RW ");
		else
			pt_dump_cont_printf(m, dmsg, "ro ");
		/* Page-level write-through caching */
		if (pr & _PAGE_PWT)
			pt_dump_cont_printf(m, dmsg, "PWT ");
		else
			pt_dump_cont_printf(m, dmsg, " ");
		/* Page-level cache disable */
		if (pr & _PAGE_PCD)
			pt_dump_cont_printf(m, dmsg, "PCD ");
		else
			pt_dump_cont_printf(m, dmsg, " ");

		/* Bit 7 has a different meaning on level 3 vs 4 */
		if (level <= 3 && pr & _PAGE_PSE)
			pt_dump_cont_printf(m, dmsg, "PSE ");
		else
			pt_dump_cont_printf(m, dmsg, " ");
		/* PAT bit: bit 7 on PTEs, bit 12 on large PMD/PUD mappings */
		if ((level == 4 && pr & _PAGE_PAT) ||
		    ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
			pt_dump_cont_printf(m, dmsg, "PAT ");
		else
			pt_dump_cont_printf(m, dmsg, " ");
		/* Global: survives TLB flushes on CR3 switch */
		if (pr & _PAGE_GLOBAL)
			pt_dump_cont_printf(m, dmsg, "GLB ");
		else
			pt_dump_cont_printf(m, dmsg, " ");
		/* No-execute */
		if (pr & _PAGE_NX)
			pt_dump_cont_printf(m, dmsg, "NX ");
		else
			pt_dump_cont_printf(m, dmsg, "x ");
	}
	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}
226 | ||
227 | static void note_wx(struct pg_state *st, unsigned long addr) | |
228 | { | |
229 | unsigned long npages; | |
230 | ||
231 | npages = (addr - st->start_address) / PAGE_SIZE; | |
232 | ||
233 | #ifdef CONFIG_PCI_BIOS | |
234 | /* | |
235 | * If PCI BIOS is enabled, the PCI BIOS area is forced to WX. | |
236 | * Inform about it, but avoid the warning. | |
237 | */ | |
238 | if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && | |
239 | addr <= PAGE_OFFSET + BIOS_END) { | |
240 | pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); | |
241 | return; | |
242 | } | |
243 | #endif | |
244 | /* Account the WX pages */ | |
245 | st->wx_pages += npages; | |
246 | WARN_ONCE(__supported_pte_mask & _PAGE_NX, | |
247 | "x86/mm: Found insecure W+X mapping at address %pS\n", | |
248 | (void *)st->start_address); | |
249 | } | |
250 | ||
251 | static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) | |
252 | { | |
253 | struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); | |
254 | pgprotval_t prot = val & PTE_FLAGS_MASK; | |
255 | pgprotval_t effective; | |
256 | ||
257 | if (level > 0) { | |
258 | pgprotval_t higher_prot = st->prot_levels[level - 1]; | |
259 | ||
260 | effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) | | |
261 | ((higher_prot | prot) & _PAGE_NX); | |
262 | } else { | |
263 | effective = prot; | |
264 | } | |
265 | ||
266 | st->prot_levels[level] = effective; | |
267 | } | |
268 | ||
/*
 * This function gets called on a break in a continuous series
 * of PTE entries; the next one is different so we need to
 * print what we collected so far.
 */
static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
		      u64 val)
{
	struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
	pgprotval_t new_prot, new_eff;
	pgprotval_t cur, eff;
	static const char units[] = "BKMGTPE";
	struct seq_file *m = st->seq;

	new_prot = val & PTE_FLAGS_MASK;
	/* Non-present entries have no effective protection. */
	if (!val)
		new_eff = 0;
	else
		new_eff = st->prot_levels[level];

	/*
	 * If we have a "break" in the series, we need to flush the state that
	 * we have now. "break" is either changing perms, levels or
	 * address space marker.
	 */
	cur = st->current_prot;
	eff = st->effective_prot;

	if (st->level == -1) {
		/* First entry */
		st->current_prot = new_prot;
		st->effective_prot = new_eff;
		st->level = level;
		st->marker = address_markers;
		st->lines = 0;
		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
				   st->marker->name);
	} else if (new_prot != cur || new_eff != eff || level != st->level ||
		   addr >= st->marker[1].start_address) {
		const char *unit = units;
		unsigned long delta;
		int width = sizeof(unsigned long) * 2;

		/* The run that just ended was writable and executable? */
		if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
			note_wx(st, addr);

		/*
		 * Now print the actual finished series
		 */
		if (!st->marker->max_lines ||
		    st->lines < st->marker->max_lines) {
			pt_dump_seq_printf(m, st->to_dmesg,
					   "0x%0*lx-0x%0*lx ",
					   width, st->start_address,
					   width, addr);

			/* Scale the run size to the largest unit dividing it. */
			delta = addr - st->start_address;
			while (!(delta & 1023) && unit[1]) {
				delta >>= 10;
				unit++;
			}
			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
					    delta, *unit);
			printk_prot(m, st->current_prot, st->level,
				    st->to_dmesg);
		}
		st->lines++;

		/*
		 * We print markers for special areas of address space,
		 * such as the start of vmalloc space etc.
		 * This helps in the interpretation.
		 */
		if (addr >= st->marker[1].start_address) {
			/* Report how many capped lines were suppressed. */
			if (st->marker->max_lines &&
			    st->lines > st->marker->max_lines) {
				unsigned long nskip =
					st->lines - st->marker->max_lines;
				pt_dump_seq_printf(m, st->to_dmesg,
						   "... %lu entr%s skipped ... \n",
						   nskip,
						   nskip == 1 ? "y" : "ies");
			}
			st->marker++;
			st->lines = 0;
			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
					   st->marker->name);
		}

		/* Start accumulating a new run at this address. */
		st->start_address = addr;
		st->current_prot = new_prot;
		st->effective_prot = new_eff;
		st->level = level;
	}
}
364 | ||
365 | static void ptdump_walk_pgd_level_core(struct seq_file *m, | |
366 | struct mm_struct *mm, pgd_t *pgd, | |
367 | bool checkwx, bool dmesg) | |
368 | { | |
369 | const struct ptdump_range ptdump_ranges[] = { | |
370 | #ifdef CONFIG_X86_64 | |
371 | {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2}, | |
372 | {GUARD_HOLE_END_ADDR, ~0UL}, | |
373 | #else | |
374 | {0, ~0UL}, | |
375 | #endif | |
376 | {0, 0} | |
377 | }; | |
378 | ||
379 | struct pg_state st = { | |
380 | .ptdump = { | |
381 | .note_page = note_page, | |
382 | .effective_prot = effective_prot, | |
383 | .range = ptdump_ranges | |
384 | }, | |
385 | .level = -1, | |
386 | .to_dmesg = dmesg, | |
387 | .check_wx = checkwx, | |
388 | .seq = m | |
389 | }; | |
390 | ||
391 | ptdump_walk_pgd(&st.ptdump, mm, pgd); | |
392 | ||
393 | if (!checkwx) | |
394 | return; | |
395 | if (st.wx_pages) | |
396 | pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", | |
397 | st.wx_pages); | |
398 | else | |
399 | pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); | |
400 | } | |
401 | ||
/* Dump @mm's kernel page tables to dmesg (seq_file @m is unused here). */
void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm)
{
	ptdump_walk_pgd_level_core(m, mm, mm->pgd, false, true);
}
406 | ||
/*
 * Dump @mm's page tables to the debugfs seq_file @m.  With @user set and
 * page-table isolation active, dump the user-mode half of the PGD pair
 * instead of the kernel one.
 */
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
				   bool user)
{
	pgd_t *pgd = mm->pgd;
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	if (user && boot_cpu_has(X86_FEATURE_PTI))
		pgd = kernel_to_user_pgdp(pgd);
#endif
	ptdump_walk_pgd_level_core(m, mm, pgd, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
418 | ||
/*
 * Run the W+X check over the PTI user-mode copy of init_mm's page tables.
 * No-op unless page-table isolation is built in and active, and NX is
 * supported (without NX the warning would be meaningless).
 */
void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	pgd_t *pgd = INIT_PGD;

	if (!(__supported_pte_mask & _PAGE_NX) ||
	    !boot_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("x86/mm: Checking user space page tables\n");
	pgd = kernel_to_user_pgdp(pgd);
	/* NULL seq_file: walk only for the W+X accounting, print nothing. */
	ptdump_walk_pgd_level_core(NULL, &init_mm, pgd, true, false);
#endif
}
433 | ||
/* Run the W+X check over the kernel's own (init_mm) page tables. */
void ptdump_walk_pgd_level_checkwx(void)
{
	ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
}
438 | ||
/*
 * Boot-time setup: fill in the address_markers[] entries whose addresses
 * are not compile-time constants (they depend on runtime layout such as
 * randomized or configuration-dependent bases).
 */
static int __init pt_dump_init(void)
{
	/*
	 * Various markers are not compile-time constants, so assign them
	 * here.
	 */
#ifdef CONFIG_X86_64
	address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
#endif
#ifdef CONFIG_KASAN
	address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
	address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
#endif
#endif
#ifdef CONFIG_X86_32
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
	address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
# ifdef CONFIG_MODIFY_LDT_SYSCALL
	address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
# endif
#endif
	return 0;
}
__initcall(pt_dump_init);