x86: begin fault_{32|64}.c unification
arch/x86/mm/fault_64.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
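
/*
 * Example: error_code == (PF_USER|PF_WRITE) is a user-mode write to a
 * not-present page; (PF_USER|PF_WRITE|PF_PROT) is a user-mode write to a
 * page that is present but write-protected.
 */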

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

#ifdef CONFIG_X86_32
/*
 * Return EIP plus the CS segment base.  The segment limit is also
 * adjusted, clamped to the kernel/user address space (whichever is
 * appropriate), and returned in *eip_limit.
 *
 * The segment is checked, because it might have been changed by another
 * task between the original faulting instruction and here.
 *
 * If CS is no longer a valid code segment, or if EIP is beyond the
 * limit, or if it is a kernel address when CS is not a kernel segment,
 * then the returned value will be greater than *eip_limit.
 *
 * This is slow, but is very rarely executed.
 */
static inline unsigned long get_segment_eip(struct pt_regs *regs,
					    unsigned long *eip_limit)
{
	unsigned long ip = regs->ip;
	unsigned seg = regs->cs & 0xffff;
	u32 seg_ar, seg_limit, base, *desc;

	/* Unlikely, but must come before segment checks. */
	if (unlikely(regs->flags & VM_MASK)) {
		base = seg << 4;
		*eip_limit = base + 0xffff;
		return base + (ip & 0xffff);
	}

	/* The standard kernel/user address space limit. */
	*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;

	/* By far the most common cases. */
	if (likely(SEGMENT_IS_FLAT_CODE(seg)))
		return ip;

	/* Check the segment exists, is within the current LDT/GDT size,
	   that kernel/user (ring 0..3) has the appropriate privilege,
	   that it's a code segment, and get the limit. */
	__asm__("larl %3,%0; lsll %3,%1"
		 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
	if ((~seg_ar & 0x9800) || ip > seg_limit) {
		*eip_limit = 0;
		return 1;	 /* So that returned ip > *eip_limit. */
	}

	/* Get the GDT/LDT descriptor base.
	   When you look for races in this code remember that
	   LDT and other horrors are only used in user space. */
	if (seg & (1<<2)) {
		/* Must lock the LDT while reading it. */
		mutex_lock(&current->mm->context.lock);
		desc = current->mm->context.ldt;
		desc = (void *)desc + (seg & ~7);
	} else {
		/* Must disable preemption while reading the GDT. */
		desc = (u32 *)get_cpu_gdt_table(get_cpu());
		desc = (void *)desc + (seg & ~7);
	}

	/* Decode the code segment base from the descriptor */
	base = get_desc_base((struct desc_struct *)desc);

	if (seg & (1<<2))
		mutex_unlock(&current->mm->context.lock);
	else
		put_cpu();

	/* Adjust EIP and segment limit, and clamp at the kernel limit.
	   It's legitimate for segments to wrap at 0xffffffff. */
	seg_limit += base;
	if (seg_limit < *eip_limit && seg_limit >= base)
		*eip_limit = seg_limit;
	return ip + base;
}
#endif

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	unsigned long limit;
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	} else {
		return 0;
	}
	instr = (unsigned char *)get_segment_eip(regs, &limit);
#else
	/* If it was an exec fault, ignore it */
	if (error_code & PF_INSTR)
		return 0;
	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
#endif

	max_instr = instr + 15;

#ifdef CONFIG_X86_64
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;
#endif

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

#ifdef CONFIG_X86_32
		if (instr > (unsigned char *)limit)
			break;
#endif
		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
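			/* (0x0F 0x0D is AMD prefetch/prefetchw,
			    0x0F 0x18 is prefetchnta/t0/t1/t2) */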
			scan_more = 0;
#ifdef CONFIG_X86_32
			if (instr > (unsigned char *)limit)
				break;
#endif
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

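/*
 * Returns non-zero if the word at address p cannot be read safely, i.e.
 * the page-table walk in dump_pagetable() below would fault on it.
 */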
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}

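/* Walk the kernel page tables for 'address' and print the entry found at
   each level (PGD, PUD, PMD, PTE), stopping at the first non-present one. */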
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOS that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
	return 0;
}
#endif

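/*
 * A reserved bit was found set in a page-table entry (error_code & PF_RSVD):
 * the page tables are corrupt, so dump them and oops.
 */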
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
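/*
 * Returns 0 if the reference kernel mapping exists (copying it into the
 * current page tables where needed), -1 if the address is not mapped in
 * init_mm either.
 */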
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}

int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, fault;
	unsigned long flags;
	siginfo_t info;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	info.si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_page_fault(regs))
		return;

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;

 again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space; if we cannot, we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
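	/* Check that the access type (read/write, present or not-present
	   page) is allowed by the protection bits of the vma we found. */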
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {

		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB. We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return. Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
		       "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->ip,
					regs->sp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */

	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}

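/*
 * pgd_list tracks the page directories that vmalloc_sync_all() below must
 * update; pgd_lock protects the list while it is walked.
 */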
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

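/*
 * Make sure the kernel PGD entries covering the vmalloc area are present in
 * every page directory on pgd_list, copying them from init_mm where missing.
 */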
void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}