arch/x86/mm/fault.c (mirror_ubuntu-hirsute-kernel.git, blame at "x86/fault: Fold smap_violation() into do_user_addr_fault()")
b2441318 1// SPDX-License-Identifier: GPL-2.0
1da177e4 2/*
1da177e4 3 * Copyright (C) 1995 Linus Torvalds
2d4a7167 4 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
f8eeb2e6 5 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
1da177e4 6 */
a2bcd473 7#include <linux/sched.h> /* test_thread_flag(), ... */
68db0cf1 8#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
a2bcd473 9#include <linux/kdebug.h> /* oops_begin/end, ... */
4cdf8dbe 10#include <linux/extable.h> /* search_exception_tables */
57c8a661 11#include <linux/memblock.h> /* max_low_pfn */
9326638c 12#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
a2bcd473 13#include <linux/mmiotrace.h> /* kmmio_handler, ... */
cdd6c482 14#include <linux/perf_event.h> /* perf_sw_event */
f672b49b 15#include <linux/hugetlb.h> /* hstate_index_to_shift */
268bb0ce 16#include <linux/prefetch.h> /* prefetchw */
56dd9470 17#include <linux/context_tracking.h> /* exception_enter(), ... */
70ffdb93 18#include <linux/uaccess.h> /* faulthandler_disabled() */
3425d934 19#include <linux/efi.h> /* efi_recover_from_page_fault()*/
50a7ca3c 20#include <linux/mm_types.h>
2d4a7167 21
019132ff 22#include <asm/cpufeature.h> /* boot_cpu_has, ... */
a2bcd473 23#include <asm/traps.h> /* dotraplinkage, ... */
24#include <asm/pgalloc.h> /* pgd_*(), ... */
f40c3300 25#include <asm/fixmap.h> /* VSYSCALL_ADDR */
26#include <asm/vsyscall.h> /* emulate_vsyscall */
ba3e127e 27#include <asm/vm86.h> /* struct vm86 */
019132ff 28#include <asm/mmu_context.h> /* vma_pkey() */
3425d934 29#include <asm/efi.h> /* efi_recover_from_page_fault()*/
1da177e4 30
d34603b0 31#define CREATE_TRACE_POINTS
32#include <asm/trace/exceptions.h>
33
b814d41f 34/*
b319eed0 35 * Returns 0 if mmiotrace is disabled, or if the fault is not
36 * handled by mmiotrace:
b814d41f 37 */
9326638c 38static nokprobe_inline int
62c9295f 39kmmio_fault(struct pt_regs *regs, unsigned long addr)
86069782 40{
0fd0e3da 41 if (unlikely(is_kmmio_active()))
42 if (kmmio_handler(regs, addr) == 1)
43 return -1;
0fd0e3da 44 return 0;
86069782 45}
46
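/*
 * Returns non-zero if an active kprobe's fault handler consumed this
 * in-kernel fault; only meaningful in non-preemptible kernel context.
 */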
9326638c 47static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
1bd858a5 48{
a980c0ef 49 if (!kprobes_built_in())
50 return 0;
51 if (user_mode(regs))
52 return 0;
53 /*
54 * To be potentially processing a kprobe fault and to be allowed to call
55 * kprobe_running(), we have to be non-preemptible.
56 */
57 if (preemptible())
58 return 0;
59 if (!kprobe_running())
60 return 0;
61 return kprobe_fault_handler(regs, X86_TRAP_PF);
33cb5243 62}
1bd858a5 63
1dc85be0 64/*
2d4a7167 65 * Prefetch quirks:
66 *
67 * 32-bit mode:
68 *
69 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
70 * Check that here and ignore it.
1dc85be0 71 *
2d4a7167 72 * 64-bit mode:
1dc85be0 73 *
2d4a7167 74 * Sometimes the CPU reports invalid exceptions on prefetch.
75 * Check that here and ignore it.
76 *
77 * Opcode checker based on code by Richard Brunner.
1dc85be0 78 */
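/*
 * For example, "prefetchnta (%rax)" encodes as 0f 18 00 and the 3DNow!
 * "prefetch (%rax)" as 0f 0d 00; the code below walks the bytes at the
 * faulting instruction pointer looking for exactly these 0x0F 0x0D /
 * 0x0F 0x18 patterns, optionally behind valid prefix bytes (e.g. a REX
 * prefix in long mode).
 */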
107a0367 79static inline int
80check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
81 unsigned char opcode, int *prefetch)
82{
83 unsigned char instr_hi = opcode & 0xf0;
84 unsigned char instr_lo = opcode & 0x0f;
85
86 switch (instr_hi) {
87 case 0x20:
88 case 0x30:
89 /*
90 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
91 * In X86_64 long mode, the CPU will signal invalid
92 * opcode if some of these prefixes are present so
93 * X86_64 will never get here anyway
94 */
95 return ((instr_lo & 7) == 0x6);
96#ifdef CONFIG_X86_64
97 case 0x40:
98 /*
99 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
100 * Need to figure out under what instruction mode the
101 * instruction was issued. Could check the LDT for lm,
102 * but for now it's good enough to assume that long
103 * mode only uses well known segments or kernel.
104 */
318f5a2a 105 return (!user_mode(regs) || user_64bit_mode(regs));
107a0367 106#endif
107 case 0x60:
108 /* 0x64 thru 0x67 are valid prefixes in all modes. */
109 return (instr_lo & 0xC) == 0x4;
110 case 0xF0:
111 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
112 return !instr_lo || (instr_lo>>1) == 1;
113 case 0x00:
114 /* Prefetch instruction is 0x0F0D or 0x0F18 */
115 if (probe_kernel_address(instr, opcode))
116 return 0;
117
118 *prefetch = (instr_lo == 0xF) &&
119 (opcode == 0x0D || opcode == 0x18);
120 return 0;
121 default:
122 return 0;
123 }
124}
125
2d4a7167 126static int
127is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
33cb5243 128{
2d4a7167 129 unsigned char *max_instr;
ab2bf0c1 130 unsigned char *instr;
33cb5243 131 int prefetch = 0;
1da177e4 132
3085354d 133 /*
134 * If it was an exec (instruction fetch) fault on NX page, then
135 * do not ignore the fault:
136 */
1067f030 137 if (error_code & X86_PF_INSTR)
1da177e4 138 return 0;
1dc85be0 139
107a0367 140 instr = (void *)convert_ip_to_linear(current, regs);
f1290ec9 141 max_instr = instr + 15;
1da177e4 142
d31bf07f 143 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
1da177e4 144 return 0;
145
107a0367 146 while (instr < max_instr) {
2d4a7167 147 unsigned char opcode;
1da177e4 148
ab2bf0c1 149 if (probe_kernel_address(instr, opcode))
33cb5243 150 break;
1da177e4 151
1da177e4 152 instr++;
153
107a0367 154 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
1da177e4 155 break;
1da177e4 156 }
157 return prefetch;
158}
159
f2f13a85 160DEFINE_SPINLOCK(pgd_lock);
161LIST_HEAD(pgd_list);
162
163#ifdef CONFIG_X86_32
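/*
 * Copy the kernel (init_mm) mapping covering @address into the given pgd,
 * one PMD at a time. Returns the kernel PMD entry, or NULL if the kernel
 * has no mapping there.
 */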
164static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
33cb5243 165{
f2f13a85 166 unsigned index = pgd_index(address);
167 pgd_t *pgd_k;
e0c4f675 168 p4d_t *p4d, *p4d_k;
f2f13a85 169 pud_t *pud, *pud_k;
170 pmd_t *pmd, *pmd_k;
2d4a7167 171
f2f13a85 172 pgd += index;
173 pgd_k = init_mm.pgd + index;
174
175 if (!pgd_present(*pgd_k))
176 return NULL;
177
178 /*
179 * set_pgd(pgd, *pgd_k); here would be useless on PAE
180 * and redundant with the set_pmd() on non-PAE. As would
e0c4f675 181 * set_p4d/set_pud.
f2f13a85 182 */
e0c4f675 183 p4d = p4d_offset(pgd, address);
184 p4d_k = p4d_offset(pgd_k, address);
185 if (!p4d_present(*p4d_k))
186 return NULL;
187
188 pud = pud_offset(p4d, address);
189 pud_k = pud_offset(p4d_k, address);
f2f13a85 190 if (!pud_present(*pud_k))
191 return NULL;
192
193 pmd = pmd_offset(pud, address);
194 pmd_k = pmd_offset(pud_k, address);
195 if (!pmd_present(*pmd_k))
196 return NULL;
197
b8bcfe99 198 if (!pmd_present(*pmd))
f2f13a85 199 set_pmd(pmd, *pmd_k);
b8bcfe99 200 else
f2f13a85 201 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
f2f13a85 202
203 return pmd_k;
204}
205
206void vmalloc_sync_all(void)
207{
208 unsigned long address;
209
210 if (SHARED_KERNEL_PMD)
211 return;
212
213 for (address = VMALLOC_START & PMD_MASK;
dc4fac84 214 address >= TASK_SIZE_MAX && address < FIXADDR_TOP;
f2f13a85 215 address += PMD_SIZE) {
f2f13a85 216 struct page *page;
217
a79e53d8 218 spin_lock(&pgd_lock);
f2f13a85 219 list_for_each_entry(page, &pgd_list, lru) {
617d34d9 220 spinlock_t *pgt_lock;
f01f7c56 221 pmd_t *ret;
617d34d9 222
a79e53d8 223 /* the pgt_lock only for Xen */
617d34d9 224 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
225
226 spin_lock(pgt_lock);
227 ret = vmalloc_sync_one(page_address(page), address);
228 spin_unlock(pgt_lock);
229
230 if (!ret)
f2f13a85 231 break;
232 }
a79e53d8 233 spin_unlock(&pgd_lock);
f2f13a85 234 }
235}
236
237/*
238 * 32-bit:
239 *
240 * Handle a fault on the vmalloc or module mapping area
241 */
9326638c 242static noinline int vmalloc_fault(unsigned long address)
f2f13a85 243{
244 unsigned long pgd_paddr;
245 pmd_t *pmd_k;
246 pte_t *pte_k;
247
248 /* Make sure we are in vmalloc area: */
249 if (!(address >= VMALLOC_START && address < VMALLOC_END))
250 return -1;
251
252 /*
253 * Synchronize this task's top level page-table
254 * with the 'reference' page table.
255 *
256 * Do _not_ use "current" here. We might be inside
257 * an interrupt in the middle of a task switch..
258 */
6c690ee1 259 pgd_paddr = read_cr3_pa();
f2f13a85 260 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
261 if (!pmd_k)
262 return -1;
263
18a95521 264 if (pmd_large(*pmd_k))
f4eafd8b 265 return 0;
266
f2f13a85 267 pte_k = pte_offset_kernel(pmd_k, address);
268 if (!pte_present(*pte_k))
269 return -1;
270
271 return 0;
272}
9326638c 273NOKPROBE_SYMBOL(vmalloc_fault);
f2f13a85 274
275/*
276 * Did it hit the DOS screen memory VA from vm86 mode?
277 */
278static inline void
279check_v8086_mode(struct pt_regs *regs, unsigned long address,
280 struct task_struct *tsk)
281{
9fda6a06 282#ifdef CONFIG_VM86
f2f13a85 283 unsigned long bit;
284
9fda6a06 285 if (!v8086_mode(regs) || !tsk->thread.vm86)
f2f13a85 286 return;
287
288 bit = (address - 0xA0000) >> PAGE_SHIFT;
289 if (bit < 32)
9fda6a06 290 tsk->thread.vm86->screen_bitmap |= 1 << bit;
291#endif
33cb5243 292}
1da177e4 293
087975b0 294static bool low_pfn(unsigned long pfn)
1da177e4 295{
087975b0 296 return pfn < max_low_pfn;
297}
1156e098 298
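/* Dump the page table entries mapping @address, starting from CR3 (32-bit). */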
087975b0 299static void dump_pagetable(unsigned long address)
300{
6c690ee1 301 pgd_t *base = __va(read_cr3_pa());
087975b0 302 pgd_t *pgd = &base[pgd_index(address)];
e0c4f675 303 p4d_t *p4d;
304 pud_t *pud;
087975b0 305 pmd_t *pmd;
306 pte_t *pte;
2d4a7167 307
1156e098 308#ifdef CONFIG_X86_PAE
39e48d9b 309 pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
087975b0 310 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
311 goto out;
39e48d9b 312#define pr_pde pr_cont
313#else
314#define pr_pde pr_info
1156e098 315#endif
e0c4f675 316 p4d = p4d_offset(pgd, address);
317 pud = pud_offset(p4d, address);
318 pmd = pmd_offset(pud, address);
39e48d9b 319 pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
320#undef pr_pde
1156e098 321
322 /*
323 * We must not directly access the pte in the highpte
324 * case if the page table is located in highmem.
325 * And let's rather not kmap-atomic the pte, just in case
2d4a7167 326 * it's allocated already:
1156e098 327 */
087975b0 328 if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
329 goto out;
1156e098 330
087975b0 331 pte = pte_offset_kernel(pmd, address);
39e48d9b 332 pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
087975b0 333out:
39e48d9b 334 pr_cont("\n");
f2f13a85 335}
336
337#else /* CONFIG_X86_64: */
338
339void vmalloc_sync_all(void)
340{
5372e155 341 sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
f2f13a85 342}
343
344/*
345 * 64-bit:
346 *
347 * Handle a fault on the vmalloc area
f2f13a85 348 */
9326638c 349static noinline int vmalloc_fault(unsigned long address)
f2f13a85 350{
565977a3 351 pgd_t *pgd, *pgd_k;
352 p4d_t *p4d, *p4d_k;
353 pud_t *pud;
354 pmd_t *pmd;
355 pte_t *pte;
f2f13a85 356
357 /* Make sure we are in vmalloc area: */
358 if (!(address >= VMALLOC_START && address < VMALLOC_END))
359 return -1;
360
ebc8827f 361 WARN_ON_ONCE(in_nmi());
362
f2f13a85 363 /*
364 * Copy kernel mappings over when needed. This can also
365 * happen within a race in page table update. In the later
366 * case just flush:
367 */
6c690ee1 368 pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
565977a3 369 pgd_k = pgd_offset_k(address);
370 if (pgd_none(*pgd_k))
f2f13a85 371 return -1;
372
ed7588d5 373 if (pgtable_l5_enabled()) {
36b3a772 374 if (pgd_none(*pgd)) {
565977a3 375 set_pgd(pgd, *pgd_k);
36b3a772 376 arch_flush_lazy_mmu_mode();
377 } else {
565977a3 378 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
36b3a772 379 }
1160c277 380 }
f2f13a85 381
b50858ce 382 /* With 4-level paging, copying happens on the p4d level. */
383 p4d = p4d_offset(pgd, address);
565977a3 384 p4d_k = p4d_offset(pgd_k, address);
385 if (p4d_none(*p4d_k))
b50858ce 386 return -1;
387
ed7588d5 388 if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
565977a3 389 set_p4d(p4d, *p4d_k);
b50858ce 390 arch_flush_lazy_mmu_mode();
391 } else {
565977a3 392 BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
b50858ce 393 }
394
36b3a772 395 BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
f2f13a85 396
b50858ce 397 pud = pud_offset(p4d, address);
565977a3 398 if (pud_none(*pud))
f2f13a85 399 return -1;
400
18a95521 401 if (pud_large(*pud))
f4eafd8b 402 return 0;
403
f2f13a85 404 pmd = pmd_offset(pud, address);
565977a3 405 if (pmd_none(*pmd))
f2f13a85 406 return -1;
407
18a95521 408 if (pmd_large(*pmd))
f4eafd8b 409 return 0;
410
f2f13a85 411 pte = pte_offset_kernel(pmd, address);
565977a3 412 if (!pte_present(*pte))
413 return -1;
f2f13a85 414
415 return 0;
416}
9326638c 417NOKPROBE_SYMBOL(vmalloc_fault);
f2f13a85 418
e05139f2 419#ifdef CONFIG_CPU_SUP_AMD
f2f13a85 420static const char errata93_warning[] =
ad361c98 421KERN_ERR
422"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
423"******* Working around it, but it may cause SEGVs or burn power.\n"
424"******* Please consider a BIOS update.\n"
425"******* Disabling USB legacy in the BIOS may also help.\n";
e05139f2 426#endif
f2f13a85 427
428/*
429 * No vm86 mode in 64-bit mode:
430 */
431static inline void
432check_v8086_mode(struct pt_regs *regs, unsigned long address,
433 struct task_struct *tsk)
434{
435}
436
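/* Returns non-zero if the page-table entry at @p cannot be read safely. */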
437static int bad_address(void *p)
438{
439 unsigned long dummy;
440
441 return probe_kernel_address((unsigned long *)p, dummy);
442}
443
444static void dump_pagetable(unsigned long address)
445{
6c690ee1 446 pgd_t *base = __va(read_cr3_pa());
087975b0 447 pgd_t *pgd = base + pgd_index(address);
e0c4f675 448 p4d_t *p4d;
1da177e4 449 pud_t *pud;
450 pmd_t *pmd;
451 pte_t *pte;
452
2d4a7167 453 if (bad_address(pgd))
454 goto bad;
455
39e48d9b 456 pr_info("PGD %lx ", pgd_val(*pgd));
2d4a7167 457
458 if (!pgd_present(*pgd))
459 goto out;
1da177e4 460
e0c4f675 461 p4d = p4d_offset(pgd, address);
462 if (bad_address(p4d))
463 goto bad;
464
39e48d9b 465 pr_cont("P4D %lx ", p4d_val(*p4d));
e0c4f675 466 if (!p4d_present(*p4d) || p4d_large(*p4d))
467 goto out;
468
469 pud = pud_offset(p4d, address);
2d4a7167 470 if (bad_address(pud))
471 goto bad;
472
39e48d9b 473 pr_cont("PUD %lx ", pud_val(*pud));
b5360222 474 if (!pud_present(*pud) || pud_large(*pud))
2d4a7167 475 goto out;
1da177e4 476
477 pmd = pmd_offset(pud, address);
2d4a7167 478 if (bad_address(pmd))
479 goto bad;
480
39e48d9b 481 pr_cont("PMD %lx ", pmd_val(*pmd));
2d4a7167 482 if (!pmd_present(*pmd) || pmd_large(*pmd))
483 goto out;
1da177e4 484
485 pte = pte_offset_kernel(pmd, address);
2d4a7167 486 if (bad_address(pte))
487 goto bad;
488
39e48d9b 489 pr_cont("PTE %lx", pte_val(*pte));
2d4a7167 490out:
39e48d9b 491 pr_cont("\n");
1da177e4 492 return;
493bad:
39e48d9b 494 pr_info("BAD\n");
8c938f9f 495}
496
f2f13a85 497#endif /* CONFIG_X86_64 */
1da177e4 498
2d4a7167 499/*
500 * Workaround for K8 erratum #93 & buggy BIOS.
501 *
502 * BIOS SMM functions are required to use a specific workaround
503 * to avoid corruption of the 64bit RIP register on C stepping K8.
504 *
505 * A lot of BIOS that didn't get tested properly miss this.
506 *
507 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
508 * Try to work around it here.
509 *
510 * Note we only handle faults in kernel here.
511 * Does nothing on 32-bit.
fdfe8aa8 512 */
33cb5243 513static int is_errata93(struct pt_regs *regs, unsigned long address)
1da177e4 514{
e05139f2 515#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
516 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
517 || boot_cpu_data.x86 != 0xf)
518 return 0;
519
65ea5b03 520 if (address != regs->ip)
1da177e4 521 return 0;
2d4a7167 522
33cb5243 523 if ((address >> 32) != 0)
1da177e4 524 return 0;
2d4a7167 525
1da177e4 526 address |= 0xffffffffUL << 32;
33cb5243 527 if ((address >= (u64)_stext && address <= (u64)_etext) ||
528 (address >= MODULES_VADDR && address <= MODULES_END)) {
a454ab31 529 printk_once(errata93_warning);
65ea5b03 530 regs->ip = address;
1da177e4 531 return 1;
532 }
fdfe8aa8 533#endif
1da177e4 534 return 0;
33cb5243 535}
1da177e4 536
35f3266f 537/*
2d4a7167 538 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
539 * to illegal addresses >4GB.
540 *
541 * We catch this in the page fault handler because these addresses
542 * are not reachable. Just detect this case and return. Any code
35f3266f 543 * segment in LDT is compatibility mode.
544 */
545static int is_errata100(struct pt_regs *regs, unsigned long address)
546{
547#ifdef CONFIG_X86_64
2d4a7167 548 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
35f3266f 549 return 1;
550#endif
551 return 0;
552}
553
29caf2f9 554static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
555{
556#ifdef CONFIG_X86_F00F_BUG
557 unsigned long nr;
2d4a7167 558
29caf2f9 559 /*
2d4a7167 560 * Pentium F0 0F C7 C8 bug workaround:
29caf2f9 561 */
e2604b49 562 if (boot_cpu_has_bug(X86_BUG_F00F)) {
29caf2f9 563 nr = (address - idt_descr.address) >> 3;
564
565 if (nr == 6) {
566 do_invalid_op(regs, 0);
567 return 1;
568 }
569 }
570#endif
571 return 0;
572}
573
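/*
 * Print the "BUG: unable to handle kernel ..." banner, including NX and
 * SMEP hints for instruction-fetch faults, then dump the page tables.
 */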
2d4a7167 574static void
575show_fault_oops(struct pt_regs *regs, unsigned long error_code,
576 unsigned long address)
b3279c7f 577{
1156e098 578 if (!oops_may_print())
579 return;
580
1067f030 581 if (error_code & X86_PF_INSTR) {
93809be8 582 unsigned int level;
426e34cc 583 pgd_t *pgd;
584 pte_t *pte;
2d4a7167 585
6c690ee1 586 pgd = __va(read_cr3_pa());
426e34cc 587 pgd += pgd_index(address);
588
589 pte = lookup_address_in_pgd(pgd, address, &level);
1156e098 590
8f766149 591 if (pte && pte_present(*pte) && !pte_exec(*pte))
d79d0d8a 592 pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
593 from_kuid(&init_user_ns, current_uid()));
eff50c34 594 if (pte && pte_present(*pte) && pte_exec(*pte) &&
595 (pgd_flags(*pgd) & _PAGE_USER) &&
1e02ce4c 596 (__read_cr4() & X86_CR4_SMEP))
d79d0d8a 597 pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
598 from_kuid(&init_user_ns, current_uid()));
1156e098 599 }
1156e098 600
4188f063 601 pr_alert("BUG: unable to handle kernel %s at %px\n",
602 address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
603 (void *)address);
2d4a7167 604
b3279c7f 605 dump_pagetable(address);
606}
607
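/*
 * A reserved bit was found set in a page table entry: the page tables
 * are corrupted, so report it and oops.
 */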
2d4a7167 608static noinline void
609pgtable_bad(struct pt_regs *regs, unsigned long error_code,
610 unsigned long address)
1da177e4 611{
2d4a7167 612 struct task_struct *tsk;
613 unsigned long flags;
614 int sig;
615
616 flags = oops_begin();
617 tsk = current;
618 sig = SIGKILL;
1209140c 619
1da177e4 620 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
92181f19 621 tsk->comm, address);
1da177e4 622 dump_pagetable(address);
2d4a7167
IM
623
624 tsk->thread.cr2 = address;
51e7dc70 625 tsk->thread.trap_nr = X86_TRAP_PF;
2d4a7167 626 tsk->thread.error_code = error_code;
627
22f5991c 628 if (__die("Bad pagetable", regs, error_code))
874d93d1 629 sig = 0;
2d4a7167 630
874d93d1 631 oops_end(flags, regs, sig);
1da177e4 632}
633
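/*
 * Handle a fault that cannot be resolved as a normal user-space fault:
 * try exception fixups, stack-overflow detection and errata workarounds,
 * and oops if none of them apply.
 */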
2d4a7167 634static noinline void
635no_context(struct pt_regs *regs, unsigned long error_code,
4fc34901 636 unsigned long address, int signal, int si_code)
92181f19 637{
638 struct task_struct *tsk = current;
92181f19 639 unsigned long flags;
640 int sig;
92181f19 641
2d4a7167 642 /* Are we prepared to handle this kernel fault? */
81fd9c18 643 if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
c026b359 644 /*
645 * Any interrupt that takes a fault gets the fixup. This makes
646 * the below recursive fault logic only apply to faults from
647 * task context.
648 */
649 if (in_interrupt())
650 return;
651
652 /*
653 * Per the above we're !in_interrupt(), aka. task context.
654 *
655 * In this case we need to make sure we're not recursively
656 * faulting through the emulate_vsyscall() logic.
657 */
2a53ccbc 658 if (current->thread.sig_on_uaccess_err && signal) {
51e7dc70 659 tsk->thread.trap_nr = X86_TRAP_PF;
1067f030 660 tsk->thread.error_code = error_code | X86_PF_USER;
4fc34901 661 tsk->thread.cr2 = address;
662
663 /* XXX: hwpoison faults will set the wrong code. */
b4fd52f2 664 force_sig_fault(signal, si_code, (void __user *)address,
665 tsk);
4fc34901 666 }
c026b359 667
668 /*
669 * Barring that, we can do the fixup and be happy.
670 */
92181f19 671 return;
4fc34901 672 }
92181f19 673
6271cfdf 674#ifdef CONFIG_VMAP_STACK
675 /*
676 * Stack overflow? During boot, we can fault near the initial
677 * stack in the direct map, but that's not an overflow -- check
678 * that we're in vmalloc space to avoid this.
679 */
680 if (is_vmalloc_addr((void *)address) &&
681 (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
682 address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
6271cfdf 683 unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
684 /*
685 * We're likely to be running with very little stack space
686 * left. It's plausible that we'd hit this condition but
687 * double-fault even before we get this far, in which case
688 * we're fine: the double-fault handler will deal with it.
689 *
690 * We don't want to make it all the way into the oops code
691 * and then double-fault, though, because we're likely to
692 * break the console driver and lose most of the stack dump.
693 */
694 asm volatile ("movq %[stack], %%rsp\n\t"
695 "call handle_stack_overflow\n\t"
696 "1: jmp 1b"
f5caf621 697 : ASM_CALL_CONSTRAINT
6271cfdf 698 : "D" ("kernel stack overflow (page fault)"),
699 "S" (regs), "d" (address),
700 [stack] "rm" (stack));
701 unreachable();
702 }
703#endif
704
92181f19 705 /*
2d4a7167 706 * 32-bit:
707 *
708 * Valid to do another page fault here, because if this fault
709 * had been triggered by is_prefetch fixup_exception would have
710 * handled it.
711 *
712 * 64-bit:
92181f19 713 *
2d4a7167 714 * Hall of shame of CPU/BIOS bugs.
92181f19 715 */
716 if (is_prefetch(regs, error_code, address))
717 return;
718
719 if (is_errata93(regs, address))
720 return;
721
3425d934 722 /*
723 * Buggy firmware could access regions which might page fault, try to
724 * recover from such faults.
725 */
726 if (IS_ENABLED(CONFIG_EFI))
727 efi_recover_from_page_fault(address);
728
92181f19 729 /*
730 * Oops. The kernel tried to access some bad page. We'll have to
2d4a7167 731 * terminate things with extreme prejudice:
92181f19 732 */
92181f19 733 flags = oops_begin();
92181f19 734
735 show_fault_oops(regs, error_code, address);
736
a70857e4 737 if (task_stack_end_corrupted(tsk))
b0f4c4b3 738 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
19803078 739
1cc99544 740 tsk->thread.cr2 = address;
51e7dc70 741 tsk->thread.trap_nr = X86_TRAP_PF;
1cc99544 742 tsk->thread.error_code = error_code;
92181f19 743
92181f19 744 sig = SIGKILL;
745 if (__die("Oops", regs, error_code))
746 sig = 0;
2d4a7167 747
92181f19 748 /* Executive summary in case the body of the oops scrolled away */
b0f4c4b3 749 printk(KERN_DEFAULT "CR2: %016lx\n", address);
2d4a7167 750
92181f19 751 oops_end(flags, regs, sig);
92181f19 752}
753
2d4a7167 754/*
755 * Print out info about fatal segfaults, if the show_unhandled_signals
756 * sysctl is set:
757 */
758static inline void
759show_signal_msg(struct pt_regs *regs, unsigned long error_code,
760 unsigned long address, struct task_struct *tsk)
761{
ba54d856 762 const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
763
2d4a7167 764 if (!unhandled_signal(tsk, SIGSEGV))
765 return;
766
767 if (!printk_ratelimit())
768 return;
769
10a7e9d8 770 printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
ba54d856 771 loglvl, tsk->comm, task_pid_nr(tsk), address,
2d4a7167 772 (void *)regs->ip, (void *)regs->sp, error_code);
773
774 print_vma_addr(KERN_CONT " in ", regs->ip);
775
776 printk(KERN_CONT "\n");
ba54d856 777
342db04a 778 show_opcodes(regs, loglvl);
2d4a7167 779}
780
02e983b7 781/*
782 * The (legacy) vsyscall page is the long page in the kernel portion
783 * of the address space that has user-accessible permissions.
784 */
785static bool is_vsyscall_vaddr(unsigned long vaddr)
786{
3ae0ad92 787 return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
02e983b7 788}
789
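/*
 * Signal a bad access without holding mmap_sem: user-mode faults get a
 * SIGSEGV with the given si_code, kernel-mode faults fall back to
 * no_context().
 */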
2d4a7167 790static void
791__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
419ceeb1 792 unsigned long address, u32 pkey, int si_code)
92181f19 793{
794 struct task_struct *tsk = current;
795
796 /* User mode accesses just cause a SIGSEGV */
1067f030 797 if (error_code & X86_PF_USER) {
92181f19 798 /*
2d4a7167 799 * It's possible to have interrupts off here:
92181f19 800 */
801 local_irq_enable();
802
803 /*
804 * Valid to do another page fault here because this one came
2d4a7167 805 * from user space:
92181f19 806 */
807 if (is_prefetch(regs, error_code, address))
808 return;
809
810 if (is_errata100(regs, address))
811 return;
812
dc4fac84 813 /*
814 * To avoid leaking information about the kernel page table
815 * layout, pretend that user-mode accesses to kernel addresses
816 * are always protection faults.
817 */
818 if (address >= TASK_SIZE_MAX)
1067f030 819 error_code |= X86_PF_PROT;
3ae36655 820
e575a86f 821 if (likely(show_unhandled_signals))
2d4a7167 822 show_signal_msg(regs, error_code, address, tsk);
823
2d4a7167 824 tsk->thread.cr2 = address;
e575a86f 825 tsk->thread.error_code = error_code;
51e7dc70 826 tsk->thread.trap_nr = X86_TRAP_PF;
92181f19 827
9db812db 828 if (si_code == SEGV_PKUERR)
419ceeb1 829 force_sig_pkuerr((void __user *)address, pkey);
9db812db 830
b4fd52f2 831 force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk);
2d4a7167 832
92181f19 833 return;
834 }
835
836 if (is_f00f_bug(regs, address))
837 return;
838
4fc34901 839 no_context(regs, error_code, address, SIGSEGV, si_code);
92181f19 840}
841
2d4a7167 842static noinline void
843bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
768fd9c6 844 unsigned long address)
92181f19 845{
419ceeb1 846 __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
92181f19 847}
848
2d4a7167 849static void
850__bad_area(struct pt_regs *regs, unsigned long error_code,
419ceeb1 851 unsigned long address, u32 pkey, int si_code)
92181f19 852{
853 struct mm_struct *mm = current->mm;
92181f19 854 /*
855 * Something tried to access memory that isn't in our memory map..
856 * Fix it, but check if it's kernel or user first..
857 */
858 up_read(&mm->mmap_sem);
859
aba1ecd3 860 __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
92181f19 861}
862
2d4a7167 863static noinline void
864bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
92181f19 865{
419ceeb1 866 __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
92181f19 867}
868
33a709b2 869static inline bool bad_area_access_from_pkeys(unsigned long error_code,
870 struct vm_area_struct *vma)
871{
07f146f5 872 /* This code is always called on the current mm */
873 bool foreign = false;
874
33a709b2 875 if (!boot_cpu_has(X86_FEATURE_OSPKE))
876 return false;
1067f030 877 if (error_code & X86_PF_PK)
33a709b2 878 return true;
07f146f5 879 /* this checks permission keys on the VMA: */
1067f030 880 if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
881 (error_code & X86_PF_INSTR), foreign))
07f146f5 882 return true;
33a709b2 883 return false;
92181f19 884}
885
2d4a7167 886static noinline void
887bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
7b2d0dba 888 unsigned long address, struct vm_area_struct *vma)
92181f19 889{
019132ff 890 /*
891 * This OSPKE check is not strictly necessary at runtime.
892 * But, doing it this way allows compiler optimizations
893 * if pkeys are compiled out.
894 */
aba1ecd3 895 if (bad_area_access_from_pkeys(error_code, vma)) {
9db812db 896 /*
897 * A protection key fault means that the PKRU value did not allow
898 * access to some PTE. Userspace can figure out what PKRU was
899 * from the XSAVE state. This function captures the pkey from
900 * the vma and passes it to userspace so userspace can discover
901 * which protection key was set on the PTE.
902 *
903 * If we get here, we know that the hardware signaled a X86_PF_PK
904 * fault and that there was a VMA once we got in the fault
905 * handler. It does *not* guarantee that the VMA we find here
906 * was the one that we faulted on.
907 *
908 * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
909 * 2. T1 : set PKRU to deny access to pkey=4, touches page
910 * 3. T1 : faults...
911 * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
912 * 5. T1 : enters fault handler, takes mmap_sem, etc...
913 * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
914 * faulted on a pte with its pkey=4.
915 */
aba1ecd3 916 u32 pkey = vma_pkey(vma);
9db812db 917
419ceeb1 918 __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
aba1ecd3 919 } else {
419ceeb1 920 __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
aba1ecd3 921 }
92181f19 922}
923
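/*
 * Deliver SIGBUS for a user-mode fault (BUS_MCEERR_AR if the page was
 * poisoned by a memory failure); kernel-mode faults go to no_context().
 */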
2d4a7167 924static void
a6e04aa9 925do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
27274f73 926 unsigned int fault)
92181f19 927{
928 struct task_struct *tsk = current;
92181f19 929
2d4a7167 930 /* Kernel mode? Handle exceptions or die: */
1067f030 931 if (!(error_code & X86_PF_USER)) {
4fc34901 932 no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
96054569 933 return;
934 }
2d4a7167 935
cd1b68f0 936 /* User-space => ok to do another page fault: */
92181f19 937 if (is_prefetch(regs, error_code, address))
938 return;
2d4a7167
IM
939
940 tsk->thread.cr2 = address;
941 tsk->thread.error_code = error_code;
51e7dc70 942 tsk->thread.trap_nr = X86_TRAP_PF;
2d4a7167 943
a6e04aa9 944#ifdef CONFIG_MEMORY_FAILURE
f672b49b 945 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
40e55394 946 unsigned lsb = 0;
947
948 pr_err(
a6e04aa9 949 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
950 tsk->comm, tsk->pid, address);
40e55394 951 if (fault & VM_FAULT_HWPOISON_LARGE)
952 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
953 if (fault & VM_FAULT_HWPOISON)
954 lsb = PAGE_SHIFT;
955 force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, tsk);
956 return;
a6e04aa9 957 }
958#endif
b4fd52f2 959 force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk);
92181f19 960}
961
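/*
 * Translate a VM_FAULT_* error returned by handle_mm_fault() into the
 * matching action: OOM handling, SIGBUS or SIGSEGV.
 */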
3a13c4d7 962static noinline void
2d4a7167 963mm_fault_error(struct pt_regs *regs, unsigned long error_code,
25c102d8 964 unsigned long address, vm_fault_t fault)
92181f19 965{
1067f030 966 if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
3a13c4d7
JW
967 no_context(regs, error_code, address, 0, 0);
968 return;
b80ef10e 969 }
b80ef10e 970
2d4a7167 971 if (fault & VM_FAULT_OOM) {
f8626854 972 /* Kernel mode? Handle exceptions or die: */
1067f030 973 if (!(error_code & X86_PF_USER)) {
4fc34901 974 no_context(regs, error_code, address,
975 SIGSEGV, SEGV_MAPERR);
3a13c4d7 976 return;
f8626854 977 }
978
c2d23f91 979 /*
980 * We ran out of memory, call the OOM killer, and return the
981 * userspace (which will retry the fault, or kill us if we got
982 * oom-killed):
983 */
984 pagefault_out_of_memory();
2d4a7167 985 } else {
f672b49b 986 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
987 VM_FAULT_HWPOISON_LARGE))
27274f73 988 do_sigbus(regs, error_code, address, fault);
33692f27 989 else if (fault & VM_FAULT_SIGSEGV)
768fd9c6 990 bad_area_nosemaphore(regs, error_code, address);
2d4a7167 991 else
992 BUG();
993 }
92181f19 994}
995
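/*
 * Returns 1 if the page table entry already permits the access described
 * by @error_code (i.e. the fault came from a stale TLB entry), 0 otherwise.
 */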
8fed6200 996static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
d8b57bb7 997{
1067f030 998 if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
d8b57bb7 999 return 0;
2d4a7167 1000
1067f030 1001 if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
d8b57bb7 1002 return 0;
1003
1004 return 1;
1005}
1006
5b727a3b 1007/*
2d4a7167 1008 * Handle a spurious fault caused by a stale TLB entry.
1009 *
1010 * This allows us to lazily refresh the TLB when increasing the
1011 * permissions of a kernel page (RO -> RW or NX -> X). Doing it
1012 * eagerly is very expensive since that implies doing a full
1013 * cross-processor TLB flush, even if no stale TLB entries exist
1014 * on other processors.
1015 *
31668511 1016 * Spurious faults may only occur if the TLB contains an entry with
1017 * fewer permission than the page table entry. Non-present (P = 0)
1018 * and reserved bit (R = 1) faults are never spurious.
1019 *
5b727a3b 1020 * There are no security implications to leaving a stale TLB when
1021 * increasing the permissions on a page.
31668511
DV
1022 *
1023 * Returns non-zero if a spurious fault was handled, zero otherwise.
1024 *
1025 * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
1026 * (Optional Invalidation).
5b727a3b 1027 */
9326638c 1028static noinline int
8fed6200 1029spurious_kernel_fault(unsigned long error_code, unsigned long address)
5b727a3b 1030{
1031 pgd_t *pgd;
e0c4f675 1032 p4d_t *p4d;
5b727a3b 1033 pud_t *pud;
1034 pmd_t *pmd;
1035 pte_t *pte;
3c3e5694 1036 int ret;
5b727a3b 1037
31668511 1038 /*
1039 * Only writes to RO or instruction fetches from NX may cause
1040 * spurious faults.
1041 *
1042 * These could be from user or supervisor accesses but the TLB
1043 * is only lazily flushed after a kernel mapping protection
1044 * change, so user accesses are not expected to cause spurious
1045 * faults.
1046 */
1067f030 1047 if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
1048 error_code != (X86_PF_INSTR | X86_PF_PROT))
5b727a3b 1049 return 0;
1050
1051 pgd = init_mm.pgd + pgd_index(address);
1052 if (!pgd_present(*pgd))
1053 return 0;
1054
e0c4f675 1055 p4d = p4d_offset(pgd, address);
1056 if (!p4d_present(*p4d))
1057 return 0;
1058
1059 if (p4d_large(*p4d))
8fed6200 1060 return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
e0c4f675
KS
1061
1062 pud = pud_offset(p4d, address);
5b727a3b 1063 if (!pud_present(*pud))
1064 return 0;
1065
d8b57bb7 1066 if (pud_large(*pud))
8fed6200 1067 return spurious_kernel_fault_check(error_code, (pte_t *) pud);
d8b57bb7 1068
5b727a3b 1069 pmd = pmd_offset(pud, address);
1070 if (!pmd_present(*pmd))
1071 return 0;
1072
d8b57bb7 1073 if (pmd_large(*pmd))
8fed6200 1074 return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
d8b57bb7 1075
5b727a3b 1076 pte = pte_offset_kernel(pmd, address);
954f8571 1077 if (!pte_present(*pte))
5b727a3b
JF
1078 return 0;
1079
8fed6200 1080 ret = spurious_kernel_fault_check(error_code, pte);
3c3e5694 1081 if (!ret)
1082 return 0;
1083
1084 /*
2d4a7167 1085 * Make sure we have permissions in PMD.
1086 * If not, then there's a bug in the page tables:
3c3e5694 1087 */
8fed6200 1088 ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
3c3e5694 1089 WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
2d4a7167 1090
3c3e5694 1091 return ret;
5b727a3b 1092}
8fed6200 1093NOKPROBE_SYMBOL(spurious_kernel_fault);
5b727a3b 1094
abd4f750 1095int show_unhandled_signals = 1;
1da177e4 1096
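/*
 * Check the faulting access against the permissions of @vma (including
 * protection keys). Returns 1 if the access must be refused with SIGSEGV,
 * 0 if it should be handed to handle_mm_fault().
 */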
2d4a7167 1097static inline int
68da336a 1098access_error(unsigned long error_code, struct vm_area_struct *vma)
92181f19 1099{
07f146f5 1100 /* This is only called for the current mm, so: */
1101 bool foreign = false;
e8c6226d 1102
1103 /*
1104 * Read or write was blocked by protection keys. This is
1105 * always an unconditional error and can never result in
1106 * a follow-up action to resolve the fault, like a COW.
1107 */
1067f030 1108 if (error_code & X86_PF_PK)
e8c6226d 1109 return 1;
1110
07f146f5 1111 /*
1112 * Make sure to check the VMA so that we do not perform
1067f030 1113 * faults just to hit a X86_PF_PK as soon as we fill in a
07f146f5 1114 * page.
1115 */
1067f030 1116 if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
1117 (error_code & X86_PF_INSTR), foreign))
07f146f5 1118 return 1;
33a709b2 1119
1067f030 1120 if (error_code & X86_PF_WRITE) {
2d4a7167 1121 /* write, present and write, not present: */
92181f19 1122 if (unlikely(!(vma->vm_flags & VM_WRITE)))
1123 return 1;
2d4a7167 1124 return 0;
92181f19 1125 }
1126
2d4a7167 1127 /* read, present: */
1067f030 1128 if (unlikely(error_code & X86_PF_PROT))
2d4a7167 1129 return 1;
1130
1131 /* read, not present: */
1132 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
1133 return 1;
1134
92181f19 1135 return 0;
1136}
1137
0973a06c 1138static int fault_in_kernel_space(unsigned long address)
1139{
3ae0ad92 1140 /*
1141 * On 64-bit systems, the vsyscall page is at an address above
1142 * TASK_SIZE_MAX, but is not considered part of the kernel
1143 * address space.
1144 */
1145 if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
1146 return false;
1147
d9517346 1148 return address >= TASK_SIZE_MAX;
0973a06c 1149}
1150
1da177e4 1151/*
8fed6200 1152 * Called for all faults where 'address' is part of the kernel address
1153 * space. Might get called for faults that originate from *code* that
1154 * ran in userspace or the kernel.
1da177e4 1155 */
8fed6200 1156static void
1157do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
1158 unsigned long address)
1da177e4 1159{
367e3f1d 1160 /*
1161 * Protection keys exceptions only happen on user pages. We
1162 * have no user pages in the kernel portion of the address
1163 * space, so do not expect them here.
1164 */
1165 WARN_ON_ONCE(hw_error_code & X86_PF_PK);
1da177e4
LT
1166
1167 /*
8fed6200 1168 * We can fault-in kernel-space virtual memory on-demand. The
1da177e4
LT
1169 * 'reference' page table is init_mm.pgd.
1170 *
1171 * NOTE! We MUST NOT take any locks for this case. We may
1172 * be in an interrupt or a critical region, and should
1173 * only copy the information from the master page table,
1174 * nothing more.
1175 *
8fed6200 1176 * Before doing this on-demand faulting, ensure that the
1177 * fault is not any of the following:
1178 * 1. A fault on a PTE with a reserved bit set.
1179 * 2. A fault caused by a user-mode access. (Do not demand-
1180 * fault kernel memory due to user-mode accesses).
1181 * 3. A fault caused by a page-level protection violation.
1182 * (A demand fault would be on a non-present page which
1183 * would have X86_PF_PROT==0).
1da177e4 1184 */
8fed6200 1185 if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
1186 if (vmalloc_fault(address) >= 0)
5b727a3b 1187 return;
8fed6200 1188 }
5b727a3b 1189
8fed6200 1190 /* Was the fault spurious, caused by lazy TLB invalidation? */
1191 if (spurious_kernel_fault(hw_error_code, address))
1192 return;
2d4a7167 1193
8fed6200 1194 /* kprobes don't want to hook the spurious faults: */
1195 if (kprobes_fault(regs))
92181f19 1196 return;
8fed6200 1197
1198 /*
1199 * Note, despite being a "bad area", there are quite a few
1200 * acceptable reasons to get here, such as erratum fixups
1201 * and handling kernel code that can fault, like get_user().
1202 *
1203 * Don't take the mm semaphore here. If we fixup a prefetch
1204 * fault we could otherwise deadlock:
1205 */
ba9f6f89 1206 bad_area_nosemaphore(regs, hw_error_code, address);
8fed6200 1207}
1208NOKPROBE_SYMBOL(do_kern_addr_fault);
1209
aa37c51b 1210/* Handle faults in the user portion of the address space */
1211static inline
1212void do_user_addr_fault(struct pt_regs *regs,
1213 unsigned long hw_error_code,
1214 unsigned long address)
1da177e4 1215{
164477c2 1216 unsigned long sw_error_code;
2d4a7167 1217 struct vm_area_struct *vma;
1da177e4 1218 struct task_struct *tsk;
1219 struct mm_struct *mm;
50a7ca3c 1220 vm_fault_t fault, major = 0;
759496ba 1221 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
1da177e4 1222
a9ba9a3b 1223 tsk = current;
1224 mm = tsk->mm;
f8c2ee22 1225
2d4a7167 1226 /* kprobes don't want to hook the spurious faults: */
e00b12e6 1227 if (unlikely(kprobes_fault(regs)))
9be260a6 1228 return;
8c914cb7 1229
5b0c2cac 1230 /*
1231 * Reserved bits are never expected to be set on
1232 * entries in the user portion of the page tables.
1233 */
164477c2 1234 if (unlikely(hw_error_code & X86_PF_RSVD))
1235 pgtable_bad(regs, hw_error_code, address);
1da177e4 1236
5b0c2cac 1237 /*
a15781b5 1238 * If SMAP is on, check for invalid kernel (supervisor)
1239 * access to user pages in the user address space.
5b0c2cac 1240 */
a15781b5 1241 if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
1242 !(hw_error_code & X86_PF_USER) &&
1243 (user_mode(regs) || !(regs->flags & X86_EFLAGS_AC))))
1244 {
ba9f6f89 1245 bad_area_nosemaphore(regs, hw_error_code, address);
4640c7ee 1246 return;
40d3cd66 1247 }
1248
1da177e4 1249 /*
2d4a7167 1250 * If we're in an interrupt, have no user context or are running
70ffdb93 1251 * in a region with pagefaults disabled then we must not take the fault
1da177e4 1252 */
70ffdb93 1253 if (unlikely(faulthandler_disabled() || !mm)) {
ba9f6f89 1254 bad_area_nosemaphore(regs, hw_error_code, address);
92181f19 1255 return;
1256 }
1da177e4 1257
164477c2 1258 /*
1259 * hw_error_code is literally the "page fault error code" passed to
1260 * the kernel directly from the hardware. But, we will shortly be
1261 * modifying it in software, so give it a new name.
1262 */
1263 sw_error_code = hw_error_code;
1264
e00b12e6 1265 /*
1266 * It's safe to allow irq's after cr2 has been saved and the
1267 * vmalloc fault has been handled.
1268 *
1269 * User-mode registers count as a user access even for any
1270 * potential system fault or CPU buglet:
1271 */
f39b6f0e 1272 if (user_mode(regs)) {
e00b12e6 1273 local_irq_enable();
164477c2 1274 /*
1275 * Up to this point, X86_PF_USER set in hw_error_code
1276 * indicated a user-mode access. But, after this,
1277 * X86_PF_USER in sw_error_code will indicate either
1278 * that, *or* an implicit kernel(supervisor)-mode access
1279 * which originated from user mode.
1280 */
1281 if (!(hw_error_code & X86_PF_USER)) {
1282 /*
1283 * The CPU was in user mode, but the CPU says
1284 * the fault was not a user-mode access.
1285 * Must be an implicit kernel-mode access,
1286 * which we do not expect to happen in the
1287 * user address space.
1288 */
1289 pr_warn_once("kernel-mode error from user-mode: %lx\n",
1290 hw_error_code);
1291
1292 sw_error_code |= X86_PF_USER;
1293 }
e00b12e6 1294 flags |= FAULT_FLAG_USER;
1295 } else {
1296 if (regs->flags & X86_EFLAGS_IF)
1297 local_irq_enable();
1298 }
1299
1300 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
1301
164477c2 1302 if (sw_error_code & X86_PF_WRITE)
759496ba 1303 flags |= FAULT_FLAG_WRITE;
164477c2 1304 if (sw_error_code & X86_PF_INSTR)
d61172b4 1305 flags |= FAULT_FLAG_INSTRUCTION;
759496ba 1306
3ae0ad92 1307#ifdef CONFIG_X86_64
3a1dfe6e 1308 /*
3ae0ad92 1309 * Instruction fetch faults in the vsyscall page might need
1310 * emulation. The vsyscall page is at a high address
1311 * (>PAGE_OFFSET), but is considered to be part of the user
1312 * address space.
1da177e4 1313 *
3ae0ad92 1314 * The vsyscall page does not have a "real" VMA, so do this
1315 * emulation before we go searching for VMAs.
1316 */
1317 if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
1318 if (emulate_vsyscall(regs, address))
1319 return;
1320 }
1321#endif
1322
3a1dfe6e 1323 /*
88259744 1324 * Kernel-mode access to the user address space should only occur
1325 * on well-defined single instructions listed in the exception
1326 * tables. But, an erroneous kernel fault occurring outside one of
1327 * those areas which also holds mmap_sem might deadlock attempting
1328 * to validate the fault against the address space.
1da177e4 1329 *
88259744 1330 * Only do the expensive exception table search when we might be at
1331 * risk of a deadlock. This happens if we
1332 * 1. Failed to acquire mmap_sem, and
6344be60 1333 * 2. The access did not originate in userspace.
1da177e4 1334 */
92181f19 1335 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
6344be60 1336 if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
88259744 1337 /*
1338 * Fault from code in kernel from
1339 * which we do not expect faults.
1340 */
ba9f6f89 1341 bad_area_nosemaphore(regs, sw_error_code, address);
92181f19 1342 return;
1343 }
d065bd81 1344retry:
1da177e4 1345 down_read(&mm->mmap_sem);
01006074 1346 } else {
1347 /*
2d4a7167 1348 * The above down_read_trylock() might have succeeded in
1349 * which case we'll have missed the might_sleep() from
1350 * down_read():
01006074 1351 */
1352 might_sleep();
1da177e4 1353 }
1354
1355 vma = find_vma(mm, address);
92181f19 1356 if (unlikely(!vma)) {
164477c2 1357 bad_area(regs, sw_error_code, address);
92181f19 1358 return;
1359 }
1360 if (likely(vma->vm_start <= address))
1da177e4 1361 goto good_area;
92181f19 1362 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
164477c2 1363 bad_area(regs, sw_error_code, address);
92181f19 1364 return;
1365 }
92181f19 1366 if (unlikely(expand_stack(vma, address))) {
164477c2 1367 bad_area(regs, sw_error_code, address);
92181f19 1368 return;
1369 }
1370
1371 /*
1372 * Ok, we have a good vm_area for this memory access, so
1373 * we can handle it..
1374 */
1da177e4 1375good_area:
164477c2 1376 if (unlikely(access_error(sw_error_code, vma))) {
1377 bad_area_access_error(regs, sw_error_code, address, vma);
92181f19 1378 return;
1da177e4 1379 }
1380
1381 /*
1382 * If for any reason at all we couldn't handle the fault,
1383 * make sure we exit gracefully rather than endlessly redo
9a95f3cf 1384 * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
1385 * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
cb0631fd 1386 *
1387 * Note that handle_userfault() may also release and reacquire mmap_sem
1388 * (and not return with VM_FAULT_RETRY), when returning to userland to
1389 * repeat the page fault later with a VM_FAULT_NOPAGE retval
1390 * (potentially after handling any pending signal during the return to
1391 * userland). The return to userland is identified whenever
1392 * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
1da177e4 1393 */
dcddffd4 1394 fault = handle_mm_fault(vma, address, flags);
26178ec1 1395 major |= fault & VM_FAULT_MAJOR;
2d4a7167 1396
3a13c4d7 1397 /*
26178ec1 1398 * If we need to retry the mmap_sem has already been released,
1399 * and if there is a fatal signal pending there is no guarantee
1400 * that we made any progress. Handle this case first.
3a13c4d7 1401 */
26178ec1 1402 if (unlikely(fault & VM_FAULT_RETRY)) {
1403 /* Retry at most once */
1404 if (flags & FAULT_FLAG_ALLOW_RETRY) {
1405 flags &= ~FAULT_FLAG_ALLOW_RETRY;
1406 flags |= FAULT_FLAG_TRIED;
1407 if (!fatal_signal_pending(tsk))
1408 goto retry;
1409 }
1410
1411 /* User mode? Just return to handle the fatal exception */
cf3c0a15 1412 if (flags & FAULT_FLAG_USER)
26178ec1
LT
1413 return;
1414
1415 /* Not returning to user mode? Handle exceptions or die: */
164477c2 1416 no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
3a13c4d7 1417 return;
26178ec1 1418 }
3a13c4d7 1419
26178ec1 1420 up_read(&mm->mmap_sem);
3a13c4d7 1421 if (unlikely(fault & VM_FAULT_ERROR)) {
ba9f6f89 1422 mm_fault_error(regs, sw_error_code, address, fault);
3a13c4d7 1423 return;
37b23e05 1424 }
1425
d065bd81 1426 /*
26178ec1 1427 * Major/minor page fault accounting. If any of the events
1428 * returned VM_FAULT_MAJOR, we account it as a major fault.
d065bd81 1429 */
26178ec1 1430 if (major) {
1431 tsk->maj_flt++;
1432 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
1433 } else {
1434 tsk->min_flt++;
1435 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
ac17dc8e 1436 }
d729ab35 1437
8c938f9f 1438 check_v8086_mode(regs, address, tsk);
1da177e4 1439}
aa37c51b 1440NOKPROBE_SYMBOL(do_user_addr_fault);
1441
1442/*
1443 * This routine handles page faults. It determines the address,
1444 * and the problem, and then passes it off to one of the appropriate
1445 * routines.
1446 */
1447static noinline void
1448__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
1449 unsigned long address)
1450{
1451 prefetchw(&current->mm->mmap_sem);
1452
1453 if (unlikely(kmmio_fault(regs, address)))
1454 return;
1455
1456 /* Was the fault on kernel-controlled part of the address space? */
1457 if (unlikely(fault_in_kernel_space(address)))
1458 do_kern_addr_fault(regs, hw_error_code, address);
1459 else
1460 do_user_addr_fault(regs, hw_error_code, address);
1461}
9326638c 1462NOKPROBE_SYMBOL(__do_page_fault);
6ba3c97a 1463
9326638c 1464static nokprobe_inline void
1465trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
1466 unsigned long error_code)
d34603b0 1467{
1468 if (user_mode(regs))
d4078e23 1469 trace_page_fault_user(address, regs, error_code);
d34603b0 1470 else
d4078e23 1471 trace_page_fault_kernel(address, regs, error_code);
d34603b0 1472}
1473
11a7ffb0 1474/*
1475 * We must have this function blacklisted from kprobes, tagged with notrace
1476 * and call read_cr2() before calling anything else. To avoid calling any
1477 * kind of tracing machinery before we've observed the CR2 value.
1478 *
1479 * exception_{enter,exit}() contains all sorts of tracepoints.
1480 */
9326638c 1481dotraplinkage void notrace
11a7ffb0 1482do_page_fault(struct pt_regs *regs, unsigned long error_code)
25c74b10 1483{
11a7ffb0 1484 unsigned long address = read_cr2(); /* Get the faulting address */
d4078e23 1485 enum ctx_state prev_state;
25c74b10 1486
1487 prev_state = exception_enter();
80954747 1488 if (trace_pagefault_enabled())
11a7ffb0 1489 trace_page_fault_entries(address, regs, error_code);
1490
0ac09f9f 1491 __do_page_fault(regs, error_code, address);
25c74b10 1492 exception_exit(prev_state);
1493}
11a7ffb0 1494NOKPROBE_SYMBOL(do_page_fault);