arch/x86/mm/fault.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/sched.h> /* test_thread_flag(), ... */
#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/extable.h> /* search_exception_tables */
#include <linux/bootmem.h> /* max_low_pfn */
#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <linux/mm_types.h>

#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */

#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>

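/*
 * For reference: the X86_PF_* page-fault error code bits tested throughout
 * this file (enum x86_pf_error_code) mirror the hardware error code pushed
 * by the CPU:
 *
 *   X86_PF_PROT  (1 << 0) - fault was a protection violation (page present)
 *   X86_PF_WRITE (1 << 1) - fault was caused by a write access
 *   X86_PF_USER  (1 << 2) - fault originated in user mode
 *   X86_PF_RSVD  (1 << 3) - a reserved bit was set in a paging structure
 *   X86_PF_INSTR (1 << 4) - fault was caused by an instruction fetch
 *   X86_PF_PK    (1 << 5) - fault was caused by a protection-key violation
 */
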
/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
 */
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
	return 0;
}

static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
{
	if (!kprobes_built_in())
		return 0;
	if (user_mode(regs))
		return 0;
	/*
	 * To be potentially processing a kprobe fault and to be allowed to call
	 * kprobe_running(), we have to be non-preemptible.
	 */
	if (preemptible())
		return 0;
	if (!kprobe_running())
		return 0;
	return kprobe_fault_handler(regs, X86_TRAP_PF);
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * 64-bit mode:
 *
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
		 * In X86_64 long mode, the CPU will signal invalid
		 * opcode if some of these prefixes are present so
		 * X86_64 will never get here anyway
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
		 * Need to figure out under what instruction mode the
		 * instruction was issued. Could check the LDT for lm,
		 * but for now it's good enough to assume that long
		 * mode only uses well known segments or kernel.
		 */
		return (!user_mode(regs) || user_64bit_mode(regs));
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (probe_kernel_address(instr, opcode))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
	unsigned char *max_instr;
	unsigned char *instr;
	int prefetch = 0;

	/*
	 * If it was an exec (instruction fetch) fault on an NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & X86_PF_INSTR)
		return 0;

	instr = (void *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
		return 0;

	while (instr < max_instr) {
		unsigned char opcode;

		if (probe_kernel_address(instr, opcode))
			break;

		instr++;

		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
			break;
	}
	return prefetch;
}
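
/*
 * The prefetch opcodes screened for above are the two-byte encodings
 * 0x0F 0x0D (3DNow! PREFETCH/PREFETCHW) and 0x0F 0x18 (PREFETCHNTA and
 * PREFETCHT0/T1/T2), which is why check_prefetch_opcode() looks for a
 * 0x0F byte followed by 0x0D or 0x18.
 */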

/*
 * A protection key fault means that the PKRU value did not allow
 * access to some PTE. Userspace can figure out what PKRU was
 * from the XSAVE state, and this function fills out a field in
 * siginfo so userspace can discover which protection key was set
 * on the PTE.
 *
 * If we get here, we know that the hardware signaled an X86_PF_PK
 * fault and that there was a VMA once we got in the fault
 * handler. It does *not* guarantee that the VMA we find here
 * was the one that we faulted on.
 *
 * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
 * 2. T1 : set PKRU to deny access to pkey=4, touches page
 * 3. T1 : faults...
 * 4. T2 : mprotect_key(foo, PAGE_SIZE, pkey=5);
 * 5. T1 : enters fault handler, takes mmap_sem, etc...
 * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
 *	   faulted on a pte with its pkey=4.
 */
static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info,
		u32 *pkey)
{
	/* This is effectively an #ifdef */
	if (!boot_cpu_has(X86_FEATURE_OSPKE))
		return;

	/* Fault not from Protection Keys: nothing to do */
	if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV))
		return;
	/*
	 * force_sig_info_fault() is called from a number of
	 * contexts, some of which have a VMA and some of which
	 * do not. The X86_PF_PK handling happens after we have a
	 * valid VMA, so we should never reach this without a
	 * valid VMA.
	 */
	if (!pkey) {
		WARN_ONCE(1, "PKU fault with no VMA passed in");
		info->si_pkey = 0;
		return;
	}
	/*
	 * si_pkey should be thought of as a strong hint, but not
	 * absolutely guaranteed to be 100% accurate because of
	 * the race explained above.
	 */
	info->si_pkey = *pkey;
}

static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
		     struct task_struct *tsk, u32 *pkey, int fault)
{
	unsigned lsb = 0;
	siginfo_t info;

	clear_siginfo(&info);
	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	if (fault & VM_FAULT_HWPOISON_LARGE)
		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
	if (fault & VM_FAULT_HWPOISON)
		lsb = PAGE_SHIFT;
	info.si_addr_lsb = lsb;

	fill_sig_info_pkey(si_signo, si_code, &info, pkey);

	force_sig_info(si_signo, &info, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	p4d_t *p4d, *p4d_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_p4d/set_pud.
	 */
	p4d = p4d_offset(pgd, address);
	p4d_k = p4d_offset(pgd_k, address);
	if (!p4d_present(*p4d_k))
		return NULL;

	pud = pud_offset(p4d, address);
	pud_k = pud_offset(p4d_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;

	if (!pmd_present(*pmd))
		set_pmd(pmd, *pmd_k);
	else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));

	return pmd_k;
}

void vmalloc_sync_all(void)
{
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	for (address = VMALLOC_START & PMD_MASK;
	     address >= TASK_SIZE_MAX && address < FIXADDR_TOP;
	     address += PMD_SIZE) {
		struct page *page;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			spinlock_t *pgt_lock;
			pmd_t *ret;

			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;

			spin_lock(pgt_lock);
			ret = vmalloc_sync_one(page_address(page), address);
			spin_unlock(pgt_lock);

			if (!ret)
				break;
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * 32-bit:
 *
 * Handle a fault on the vmalloc or module mapping area
 */
static noinline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3_pa();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;

	if (pmd_large(*pmd_k))
		return 0;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;

	return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);

/*
 * Did it hit the DOS screen memory VA from vm86 mode?
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
#ifdef CONFIG_VM86
	unsigned long bit;

	if (!v8086_mode(regs) || !tsk->thread.vm86)
		return;

	bit = (address - 0xA0000) >> PAGE_SHIFT;
	if (bit < 32)
		tsk->thread.vm86->screen_bitmap |= 1 << bit;
#endif
}

static bool low_pfn(unsigned long pfn)
{
	return pfn < max_low_pfn;
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = &base[pgd_index(address)];
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

#ifdef CONFIG_X86_PAE
	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
		goto out;
#define pr_pde pr_cont
#else
#define pr_pde pr_info
#endif
	p4d = p4d_offset(pgd, address);
	pud = pud_offset(p4d, address);
	pmd = pmd_offset(pud, address);
	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already:
	 */
	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	pr_cont("\n");
}

#else /* CONFIG_X86_64: */

void vmalloc_sync_all(void)
{
	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
}

/*
 * 64-bit:
 *
 * Handle a fault on the vmalloc area
 */
static noinline int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_k;
	p4d_t *p4d, *p4d_k;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	WARN_ON_ONCE(in_nmi());

	/*
	 * Copy kernel mappings over when needed. This can also
	 * happen within a race in page table update. In the latter
	 * case just flush:
	 */
	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
	pgd_k = pgd_offset_k(address);
	if (pgd_none(*pgd_k))
		return -1;

	if (pgtable_l5_enabled()) {
		if (pgd_none(*pgd)) {
			set_pgd(pgd, *pgd_k);
			arch_flush_lazy_mmu_mode();
		} else {
			BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
		}
	}

	/* With 4-level paging, copying happens on the p4d level. */
	p4d = p4d_offset(pgd, address);
	p4d_k = p4d_offset(pgd_k, address);
	if (p4d_none(*p4d_k))
		return -1;

	if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
		set_p4d(p4d, *p4d_k);
		arch_flush_lazy_mmu_mode();
	} else {
		BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
	}

	BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);

	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return -1;

	if (pud_large(*pud))
		return 0;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return -1;

	if (pmd_large(*pmd))
		return 0;

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return -1;

	return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);

#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * No vm86 mode in 64-bit mode:
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
}

static int bad_address(void *p)
{
	unsigned long dummy;

	return probe_kernel_address((unsigned long *)p, dummy);
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = base + pgd_index(address);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (bad_address(pgd))
		goto bad;

	pr_info("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (bad_address(p4d))
		goto bad;

	pr_cont("P4D %lx ", p4d_val(*p4d));
	if (!p4d_present(*p4d) || p4d_large(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (bad_address(pud))
		goto bad;

	pr_cont("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	pr_cont("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	pr_cont("PTE %lx", pte_val(*pte));
out:
	pr_cont("\n");
	return;
bad:
	pr_info("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOSes that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
	    || boot_cpu_data.x86 != 0xf)
		return 0;

	if (address != regs->ip)
		return 0;

	if ((address >> 32) != 0)
		return 0;

	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk_once(errata93_warning);
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return. Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
		return 1;
#endif
	return 0;
}
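
/*
 * The (regs->cs & (1<<2)) test in is_errata100() checks the selector's
 * Table Indicator bit: a set TI bit means the code segment came from the
 * LDT, matching the "any code segment in LDT is compatibility mode"
 * assumption above.
 */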

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;

	/*
	 * Pentium F0 0F C7 C8 bug workaround:
	 */
	if (boot_cpu_has_bug(X86_BUG_F00F)) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
		unsigned long address)
{
	if (!oops_may_print())
		return;

	if (error_code & X86_PF_INSTR) {
		unsigned int level;
		pgd_t *pgd;
		pte_t *pte;

		pgd = __va(read_cr3_pa());
		pgd += pgd_index(address);

		pte = lookup_address_in_pgd(pgd, address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
		if (pte && pte_present(*pte) && pte_exec(*pte) &&
				(pgd_flags(*pgd) & _PAGE_USER) &&
				(__read_cr4() & X86_CR4_SMEP))
			pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
	}

	pr_alert("BUG: unable to handle kernel %s at %px\n",
		 address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
		 (void *)address);

	dump_pagetable(address);
}

static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
	    unsigned long address)
{
	struct task_struct *tsk;
	unsigned long flags;
	int sig;

	flags = oops_begin();
	tsk = current;
	sig = SIGKILL;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_nr = X86_TRAP_PF;
	tsk->thread.error_code = error_code;

	if (__die("Bad pagetable", regs, error_code))
		sig = 0;

	oops_end(flags, regs, sig);
}

static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, int signal, int si_code)
{
	struct task_struct *tsk = current;
	unsigned long flags;
	int sig;

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs, X86_TRAP_PF)) {
		/*
		 * Any interrupt that takes a fault gets the fixup. This makes
		 * the below recursive fault logic only apply to faults from
		 * task context.
		 */
		if (in_interrupt())
			return;

		/*
		 * Per the above we're !in_interrupt(), aka. task context.
		 *
		 * In this case we need to make sure we're not recursively
		 * faulting through the emulate_vsyscall() logic.
		 */
		if (current->thread.sig_on_uaccess_err && signal) {
			tsk->thread.trap_nr = X86_TRAP_PF;
			tsk->thread.error_code = error_code | X86_PF_USER;
			tsk->thread.cr2 = address;

			/* XXX: hwpoison faults will set the wrong code. */
			force_sig_info_fault(signal, si_code, address,
					     tsk, NULL, 0);
		}

		/*
		 * Barring that, we can do the fixup and be happy.
		 */
		return;
	}

#ifdef CONFIG_VMAP_STACK
	/*
	 * Stack overflow? During boot, we can fault near the initial
	 * stack in the direct map, but that's not an overflow -- check
	 * that we're in vmalloc space to avoid this.
	 */
	if (is_vmalloc_addr((void *)address) &&
	    (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
	     address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
		unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
		/*
		 * We're likely to be running with very little stack space
		 * left. It's plausible that we'd hit this condition but
		 * double-fault even before we get this far, in which case
		 * we're fine: the double-fault handler will deal with it.
		 *
		 * We don't want to make it all the way into the oops code
		 * and then double-fault, though, because we're likely to
		 * break the console driver and lose most of the stack dump.
		 */
		asm volatile ("movq %[stack], %%rsp\n\t"
			      "call handle_stack_overflow\n\t"
			      "1: jmp 1b"
			      : ASM_CALL_CONSTRAINT
			      : "D" ("kernel stack overflow (page fault)"),
				"S" (regs), "d" (address),
				[stack] "rm" (stack));
		unreachable();
	}
#endif

	/*
	 * 32-bit:
	 *
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * 64-bit:
	 *
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	if (task_stack_end_corrupted(tsk))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	tsk->thread.cr2 = address;
	tsk->thread.trap_nr = X86_TRAP_PF;
	tsk->thread.error_code = error_code;

	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_DEFAULT "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
		unsigned long address, struct task_struct *tsk)
{
	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;

	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
		loglvl, tsk->comm, task_pid_nr(tsk), address,
		(void *)regs->ip, (void *)regs->sp, error_code);

	print_vma_addr(KERN_CONT " in ", regs->ip);

	printk(KERN_CONT "\n");

	show_opcodes(regs, loglvl);
}

static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address, u32 *pkey, int si_code)
{
	struct task_struct *tsk = current;

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & X86_PF_USER) {
		/*
		 * It's possible to have interrupts off here:
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space:
		 */
		if (is_prefetch(regs, error_code, address))
			return;

		if (is_errata100(regs, address))
			return;

#ifdef CONFIG_X86_64
		/*
		 * Instruction fetch faults in the vsyscall page might need
		 * emulation.
		 */
		if (unlikely((error_code & X86_PF_INSTR) &&
			     ((address & ~0xfff) == VSYSCALL_ADDR))) {
			if (emulate_vsyscall(regs, address))
				return;
		}
#endif

		/*
		 * To avoid leaking information about the kernel page table
		 * layout, pretend that user-mode accesses to kernel addresses
		 * are always protection faults.
		 */
		if (address >= TASK_SIZE_MAX)
			error_code |= X86_PF_PROT;

		if (likely(show_unhandled_signals))
			show_signal_msg(regs, error_code, address, tsk);

		tsk->thread.cr2 = address;
		tsk->thread.error_code = error_code;
		tsk->thread.trap_nr = X86_TRAP_PF;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);

		return;
	}

	if (is_f00f_bug(regs, address))
		return;

	no_context(regs, error_code, address, SIGSEGV, si_code);
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		     unsigned long address, u32 *pkey)
{
	__bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, struct vm_area_struct *vma, int si_code)
{
	struct mm_struct *mm = current->mm;
	u32 pkey;

	if (vma)
		pkey = vma_pkey(vma);

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	up_read(&mm->mmap_sem);

	__bad_area_nosemaphore(regs, error_code, address,
			       (vma) ? &pkey : NULL, si_code);
}

static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
}

static inline bool bad_area_access_from_pkeys(unsigned long error_code,
		struct vm_area_struct *vma)
{
	/* This code is always called on the current mm */
	bool foreign = false;

	if (!boot_cpu_has(X86_FEATURE_OSPKE))
		return false;
	if (error_code & X86_PF_PK)
		return true;
	/* this checks permission keys on the VMA: */
	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				       (error_code & X86_PF_INSTR), foreign))
		return true;
	return false;
}

static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
		      unsigned long address, struct vm_area_struct *vma)
{
	/*
	 * This OSPKE check is not strictly necessary at runtime.
	 * But, doing it this way allows compiler optimizations
	 * if pkeys are compiled out.
	 */
	if (bad_area_access_from_pkeys(error_code, vma))
		__bad_area(regs, error_code, address, vma, SEGV_PKUERR);
	else
		__bad_area(regs, error_code, address, vma, SEGV_ACCERR);
}

static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
	  u32 *pkey, unsigned int fault)
{
	struct task_struct *tsk = current;
	int code = BUS_ADRERR;

	/* Kernel mode? Handle exceptions or die: */
	if (!(error_code & X86_PF_USER)) {
		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
		return;
	}

	/* User-space => ok to do another page fault: */
	if (is_prefetch(regs, error_code, address))
		return;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_nr = X86_TRAP_PF;

#ifdef CONFIG_MEMORY_FAILURE
	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
		printk(KERN_ERR
	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
			tsk->comm, tsk->pid, address);
		code = BUS_MCEERR_AR;
	}
#endif
	force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
}

static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
	       unsigned long address, u32 *pkey, vm_fault_t fault)
{
	if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
		no_context(regs, error_code, address, 0, 0);
		return;
	}

	if (fault & VM_FAULT_OOM) {
		/* Kernel mode? Handle exceptions or die: */
		if (!(error_code & X86_PF_USER)) {
			no_context(regs, error_code, address,
				   SIGSEGV, SEGV_MAPERR);
			return;
		}

		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we
		 * got oom-killed):
		 */
		pagefault_out_of_memory();
	} else {
		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
			     VM_FAULT_HWPOISON_LARGE))
			do_sigbus(regs, error_code, address, pkey, fault);
		else if (fault & VM_FAULT_SIGSEGV)
			bad_area_nosemaphore(regs, error_code, address, pkey);
		else
			BUG();
	}
}

static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
		return 0;

	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
		return 0;
	/*
	 * Note: We do not do lazy flushing on protection key
	 * changes, so no spurious fault will ever set X86_PF_PK.
	 */
	if ((error_code & X86_PF_PK))
		return 1;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X). Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * Spurious faults may only occur if the TLB contains an entry with
 * fewer permissions than the page table entry. Non-present (P = 0)
 * and reserved bit (R = 1) faults are never spurious.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * Returns non-zero if a spurious fault was handled, zero otherwise.
 *
 * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
 * (Optional Invalidation).
 */
static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret;

	/*
	 * Only writes to RO or instruction fetches from NX may cause
	 * spurious faults.
	 *
	 * These could be from user or supervisor accesses but the TLB
	 * is only lazily flushed after a kernel mapping protection
	 * change, so user accesses are not expected to cause spurious
	 * faults.
	 */
	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
	    error_code != (X86_PF_INSTR | X86_PF_PROT))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;

	if (p4d_large(*p4d))
		return spurious_fault_check(error_code, (pte_t *) p4d);

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	ret = spurious_fault_check(error_code, pte);
	if (!ret)
		return 0;

	/*
	 * Make sure we have permissions in PMD.
	 * If not, then there's a bug in the page tables:
	 */
	ret = spurious_fault_check(error_code, (pte_t *) pmd);
	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

	return ret;
}
NOKPROBE_SYMBOL(spurious_fault);

int show_unhandled_signals = 1;

static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
	/* This is only called for the current mm, so: */
	bool foreign = false;

	/*
	 * Read or write was blocked by protection keys. This is
	 * always an unconditional error and can never result in
	 * a follow-up action to resolve the fault, like a COW.
	 */
	if (error_code & X86_PF_PK)
		return 1;

	/*
	 * Make sure to check the VMA so that we do not perform
	 * faults just to hit an X86_PF_PK as soon as we fill in a
	 * page.
	 */
	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				       (error_code & X86_PF_INSTR), foreign))
		return 1;

	if (error_code & X86_PF_WRITE) {
		/* write, present and write, not present: */
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
		return 0;
	}

	/* read, present: */
	if (unlikely(error_code & X86_PF_PROT))
		return 1;

	/* read, not present: */
	if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
		return 1;

	return 0;
}

static int fault_in_kernel_space(unsigned long address)
{
	return address >= TASK_SIZE_MAX;
}

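/*
 * SMAP (Supervisor Mode Access Prevention): with CR4.SMAP set, a
 * supervisor-mode data access to a user-accessible page faults unless
 * EFLAGS.AC is set (which stac()/clac() toggle around legitimate
 * uaccess). The check below reports a violation for supervisor-mode
 * accesses performed with AC clear on SMAP-capable hardware.
 */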
static inline bool smap_violation(int error_code, struct pt_regs *regs)
{
	if (!IS_ENABLED(CONFIG_X86_SMAP))
		return false;

	if (!static_cpu_has(X86_FEATURE_SMAP))
		return false;

	if (error_code & X86_PF_USER)
		return false;

	if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
		return false;

	return true;
}

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
		unsigned long address)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct mm_struct *mm;
	vm_fault_t fault, major = 0;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
	u32 pkey;

	tsk = current;
	mm = tsk->mm;

	prefetchw(&mm->mmap_sem);

	if (unlikely(kmmio_fault(regs, address)))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(fault_in_kernel_space(address))) {
		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}

		/* Can handle a stale RO->RW TLB: */
		if (spurious_fault(error_code, address))
			return;

		/* kprobes don't want to hook the spurious faults: */
		if (kprobes_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock:
		 */
		bad_area_nosemaphore(regs, error_code, address, NULL);

		return;
	}

	/* kprobes don't want to hook the spurious faults: */
	if (unlikely(kprobes_fault(regs)))
		return;

	if (unlikely(error_code & X86_PF_RSVD))
		pgtable_bad(regs, error_code, address);

	if (unlikely(smap_violation(error_code, regs))) {
		bad_area_nosemaphore(regs, error_code, address, NULL);
		return;
	}

	/*
	 * If we're in an interrupt, have no user context or are running
	 * in a region with pagefaults disabled then we must not take the fault
	 */
	if (unlikely(faulthandler_disabled() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address, NULL);
		return;
	}

	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet:
	 */
	if (user_mode(regs)) {
		local_irq_enable();
		error_code |= X86_PF_USER;
		flags |= FAULT_FLAG_USER;
	} else {
		if (regs->flags & X86_EFLAGS_IF)
			local_irq_enable();
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

	if (error_code & X86_PF_WRITE)
		flags |= FAULT_FLAG_WRITE;
	if (error_code & X86_PF_INSTR)
		flags |= FAULT_FLAG_INSTRUCTION;

	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in
	 * the kernel and should generate an OOPS. Unfortunately, in the
	 * case of an erroneous fault occurring in a code path which already
	 * holds mmap_sem we will deadlock attempting to validate the fault
	 * against the address space. Luckily the kernel only validly
	 * references user space from well defined areas of code, which are
	 * listed in the exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a
	 * deadlock. Attempt to lock the address space, if we cannot we then
	 * validate the source. If this is invalid we can skip the address
	 * space check, thus avoiding the deadlock:
	 */
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		if (!(error_code & X86_PF_USER) &&
		    !search_exception_tables(regs->ip)) {
			bad_area_nosemaphore(regs, error_code, address, NULL);
			return;
		}
retry:
		down_read(&mm->mmap_sem);
	} else {
		/*
		 * The above down_read_trylock() might have succeeded in
		 * which case we'll have missed the might_sleep() from
		 * down_read():
		 */
		might_sleep();
	}

	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, error_code, address);
		return;
	}
	if (error_code & X86_PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535, $31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
			bad_area(regs, error_code, address);
			return;
		}
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	if (unlikely(access_error(error_code, vma))) {
		bad_area_access_error(regs, error_code, address, vma);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
	 * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
	 *
	 * Note that handle_userfault() may also release and reacquire mmap_sem
	 * (and not return with VM_FAULT_RETRY), when returning to userland to
	 * repeat the page fault later with a VM_FAULT_NOPAGE retval
	 * (potentially after handling any pending signal during the return to
	 * userland). The return to userland is identified whenever
	 * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
	 * Thus we have to be careful about not touching vma after handling the
	 * fault, so we read the pkey beforehand.
	 */
	pkey = vma_pkey(vma);
	fault = handle_mm_fault(vma, address, flags);
	major |= fault & VM_FAULT_MAJOR;

	/*
	 * If we need to retry, the mmap_sem has already been released,
	 * and if there is a fatal signal pending there is no guarantee
	 * that we made any progress. Handle this case first.
	 */
	if (unlikely(fault & VM_FAULT_RETRY)) {
		/* Retry at most once */
		if (flags & FAULT_FLAG_ALLOW_RETRY) {
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			flags |= FAULT_FLAG_TRIED;
			if (!fatal_signal_pending(tsk))
				goto retry;
		}

		/* User mode? Just return to handle the fatal exception */
		if (flags & FAULT_FLAG_USER)
			return;

		/* Not returning to user mode? Handle exceptions or die: */
		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
		return;
	}

	up_read(&mm->mmap_sem);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		mm_fault_error(regs, error_code, address, &pkey, fault);
		return;
	}

	/*
	 * Major/minor page fault accounting. If any of the events
	 * returned VM_FAULT_MAJOR, we account it as a major fault.
	 */
	if (major) {
		tsk->maj_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
	} else {
		tsk->min_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
	}

	check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(__do_page_fault);

static nokprobe_inline void
trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
			 unsigned long error_code)
{
	if (user_mode(regs))
		trace_page_fault_user(address, regs, error_code);
	else
		trace_page_fault_kernel(address, regs, error_code);
}

/*
 * We must have this function blacklisted from kprobes, tagged with notrace
 * and call read_cr2() before calling anything else, to avoid calling any
 * kind of tracing machinery before we've observed the CR2 value.
 *
 * exception_{enter,exit}() contains all sorts of tracepoints.
 */
dotraplinkage void notrace
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address = read_cr2(); /* Get the faulting address */
	enum ctx_state prev_state;

	prev_state = exception_enter();
	if (trace_pagefault_enabled())
		trace_page_fault_entries(address, regs, error_code);

	__do_page_fault(regs, error_code, address);
	exception_exit(prev_state);
}
NOKPROBE_SYMBOL(do_page_fault);