/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>		/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

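/*
 * Give kprobes a chance to handle the fault (trap 14) before the regular
 * page fault path runs; returns nonzero when a kprobe fault handler
 * consumed the fault.
 */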
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
#ifdef CONFIG_X86_32
	if (!user_mode_vm(regs)) {
#else
	if (!user_mode(regs)) {
#endif
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	if (!(__supported_pte_mask & _PAGE_NX))
		return 0;
#endif

	/* If it was an exec fault on an NX page, ignore it */
	if (error_code & PF_INSTR)
		return 0;

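	/*
	 * Decode starts at the faulting instruction pointer; x86
	 * instructions are at most 15 bytes long, so never scan past
	 * instr + 15.
	 */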
	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

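/*
 * Fill in a siginfo describing the fault at @address and deliver
 * @si_signo (SIGSEGV or SIGBUS) with @si_code to @tsk.
 */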
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

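/*
 * Print the page-table entries mapping @address, from the top level down
 * to the PTE, so an oops shows where the translation stops.
 */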
void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
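/*
 * 32-bit only: copy the kernel page-middle-directory entry covering
 * @address from init_mm's reference page table into @pgd, returning the
 * kernel pmd (or NULL when the kernel side has no mapping there yet).
 */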
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C stepping K8.
   A lot of BIOSes that weren't tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel space here.
   Does nothing on X86_32.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: a K8 in compat mode occasionally jumps to
 * illegal addresses >4GB. We catch this in the page fault handler because
 * these addresses are not reachable. Just detect this case and return.
 * Any code segment in the LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

void do_invalid_op(struct pt_regs *, unsigned long);

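/*
 * The Pentium F00F workaround maps the IDT read-only, so the lockup shows
 * up here as a page fault on the IDT; when the faulting entry is vector 6
 * (invalid opcode), redirect the fault to do_invalid_op().
 */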
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

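/*
 * Print the oops banner for a fault on a kernel address: flag NX
 * violations on PAE kernels, report NULL dereferences separately, and
 * dump the page-table walk for the faulting address.
 */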
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	if (error_code & PF_INSTR) {
		int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
	printk(KERN_CONT " at %08lx\n", address);
#else
	printk(KERN_CONT " at %016lx\n", address);
#endif
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

#ifdef CONFIG_X86_64
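/*
 * A reserved-bit page fault means the kernel's own page tables are
 * corrupted; report the address, dump the walk, and die.
 */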
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif

/*
 * Handle a spurious fault caused by a stale TLB entry. This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in a page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}

int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
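/*
 * regs holds the register state saved at the fault and error_code carries
 * the PF_* bits pushed by the hardware; the faulting address itself is
 * read from CR2 below.
 */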
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, si_code;
	int fault;
#ifdef CONFIG_X86_64
	unsigned long flags;
#endif

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(address, error_code))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
	   fault has been handled. */
	if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
		local_irq_enable();

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;
#else /* CONFIG_X86_64 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(address, error_code))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}
	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:
#endif
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
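	/*
	 * Check the access type against the VMA's permissions; 'write'
	 * selects the write-fault path in handle_mm_fault() below.
	 */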
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

#ifdef CONFIG_X86_32
survive:
#endif
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
#ifdef CONFIG_X86_32
			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
#else
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
#endif
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address, regs->ip,
			regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return;

	/*
	 * X86_32
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * X86_64
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
#ifdef CONFIG_X86_32
	bust_spinlocks(1);
#else
	flags = oops_begin();
#endif

	show_fault_oops(regs, error_code, address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);
#else
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
#endif

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(tsk)) {
		yield();
#ifdef CONFIG_X86_32
		down_read(&mm->mmap_sem);
		goto survive;
#else
		goto again;
#endif
	}

	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;
#ifdef CONFIG_X86_32
	/* User space => ok to do another page fault */
	if (is_prefetch(regs, address, error_code))
		return;
#endif
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

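/*
 * pgd_lock protects pgd_list, the list of top-level page directories that
 * vmalloc_sync_all() keeps in sync with the kernel's reference page table.
 */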
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

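/*
 * Propagate the kernel's vmalloc-area mappings into every page directory
 * on pgd_list, so those entries do not have to be faulted in lazily by
 * vmalloc_fault() one task at a time.
 */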
void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = TASK_SIZE;
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			unsigned long flags;
			struct page *page;

			spin_lock_irqsave(&pgd_lock, flags);
			list_for_each_entry(page, &pgd_list, lru) {
				if (!vmalloc_sync_one(page_address(page),
						      address))
					break;
			}
			spin_unlock_irqrestore(&pgd_lock, flags);
			if (!page)
				set_bit(pgd_index(address), insync);
		}
		if (address == start && test_bit(pgd_index(address), insync))
			start = address + PGDIR_SIZE;
	}
#else /* CONFIG_X86_64 */
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
#endif
}