x86: optimise x86's do_page_fault (C entry point for the page fault path)

author Nick Piggin <npiggin@suse.de>

Tue, 20 Jan 2009 03:24:26 +0000 (04:24 +0100)

committer Ingo Molnar <mingo@elte.hu>

Tue, 20 Jan 2009 12:14:23 +0000 (13:14 +0100)
author Nick Piggin <npiggin@suse.de>
Tue, 20 Jan 2009 03:24:26 +0000 (04:24 +0100)
committer Ingo Molnar <mingo@elte.hu>
Tue, 20 Jan 2009 12:14:23 +0000 (13:14 +0100)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c

index 90dfae511a41e1dcf7fcf188b94d13c123780606..033292dc9e214d96b28ff6154bca2af366b4b711 100644 (file)
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -91,8 +91,8 @@ static inline int notify_page_fault(struct pt_regs *regs)
   *
   * Opcode checker based on code by Richard Brunner
   */
-static int is_prefetch(struct pt_regs *regs, unsigned long addr,
-                      unsigned long error_code)
+static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
+                       unsigned long addr)
  {
         unsigned char *instr;
         int scan_more = 1;
@@ -409,15 +409,15 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
  }
  
  #ifdef CONFIG_X86_64
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
-                                unsigned long error_code)
+static noinline void pgtable_bad(struct pt_regs *regs,
+                        unsigned long error_code, unsigned long address)
  {
         unsigned long flags = oops_begin();
         int sig = SIGKILL;
-       struct task_struct *tsk;
+       struct task_struct *tsk = current;
  
         printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
-              current->comm, address);
+              tsk->comm, address);
         dump_pagetable(address);
         tsk = current;
         tsk->thread.cr2 = address;
@@ -429,6 +429,190 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
  }
  #endif
  
+static noinline void no_context(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address)
+{
+       struct task_struct *tsk = current;
+#ifdef CONFIG_X86_64
+       unsigned long flags;
+       int sig;
+#endif
+
+       /* Are we prepared to handle this kernel fault?  */
+       if (fixup_exception(regs))
+               return;
+
+       /*
+        * X86_32
+        * Valid to do another page fault here, because if this fault
+        * had been triggered by is_prefetch fixup_exception would have
+        * handled it.
+        *
+        * X86_64
+        * Hall of shame of CPU/BIOS bugs.
+        */
+       if (is_prefetch(regs, error_code, address))
+               return;
+
+       if (is_errata93(regs, address))
+               return;
+
+       /*
+        * Oops. The kernel tried to access some bad page. We'll have to
+        * terminate things with extreme prejudice.
+        */
+#ifdef CONFIG_X86_32
+       bust_spinlocks(1);
+#else
+       flags = oops_begin();
+#endif
+
+       show_fault_oops(regs, error_code, address);
+
+       tsk->thread.cr2 = address;
+       tsk->thread.trap_no = 14;
+       tsk->thread.error_code = error_code;
+
+#ifdef CONFIG_X86_32
+       die("Oops", regs, error_code);
+       bust_spinlocks(0);
+       do_exit(SIGKILL);
+#else
+       sig = SIGKILL;
+       if (__die("Oops", regs, error_code))
+               sig = 0;
+       /* Executive summary in case the body of the oops scrolled away */
+       printk(KERN_EMERG "CR2: %016lx\n", address);
+       oops_end(flags, regs, sig);
+#endif
+}
+
+static void __bad_area_nosemaphore(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address,
+                       int si_code)
+{
+       struct task_struct *tsk = current;
+
+       /* User mode accesses just cause a SIGSEGV */
+       if (error_code & PF_USER) {
+               /*
+                * It's possible to have interrupts off here.
+                */
+               local_irq_enable();
+
+               /*
+                * Valid to do another page fault here because this one came
+                * from user space.
+                */
+               if (is_prefetch(regs, error_code, address))
+                       return;
+
+               if (is_errata100(regs, address))
+                       return;
+
+               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+                   printk_ratelimit()) {
+                       printk(
+                       "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+                       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+                       tsk->comm, task_pid_nr(tsk), address,
+                       (void *) regs->ip, (void *) regs->sp, error_code);
+                       print_vma_addr(" in ", regs->ip);
+                       printk("\n");
+               }
+
+               tsk->thread.cr2 = address;
+               /* Kernel addresses are always protection faults */
+               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+               tsk->thread.trap_no = 14;
+               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+               return;
+       }
+
+       if (is_f00f_bug(regs, address))
+               return;
+
+       no_context(regs, error_code, address);
+}
+
+static noinline void bad_area_nosemaphore(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address)
+{
+       __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+}
+
+static void __bad_area(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address,
+                       int si_code)
+{
+       struct mm_struct *mm = current->mm;
+
+       /*
+        * Something tried to access memory that isn't in our memory map..
+        * Fix it, but check if it's kernel or user first..
+        */
+       up_read(&mm->mmap_sem);
+
+       __bad_area_nosemaphore(regs, error_code, address, si_code);
+}
+
+static noinline void bad_area(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address)
+{
+       __bad_area(regs, error_code, address, SEGV_MAPERR);
+}
+
+static noinline void bad_area_access_error(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address)
+{
+       __bad_area(regs, error_code, address, SEGV_ACCERR);
+}
+
+/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
+static void out_of_memory(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address)
+{
+       /*
+        * We ran out of memory, call the OOM killer, and return the userspace
+        * (which will retry the fault, or kill us if we got oom-killed).
+        */
+       up_read(&current->mm->mmap_sem);
+       pagefault_out_of_memory();
+}
+
+static void do_sigbus(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address)
+{
+       struct task_struct *tsk = current;
+       struct mm_struct *mm = tsk->mm;
+
+       up_read(&mm->mmap_sem);
+
+       /* Kernel mode? Handle exceptions or die */
+       if (!(error_code & PF_USER))
+               no_context(regs, error_code, address);
+#ifdef CONFIG_X86_32
+       /* User space => ok to do another page fault */
+       if (is_prefetch(regs, error_code, address))
+               return;
+#endif
+       tsk->thread.cr2 = address;
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_no = 14;
+       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+static noinline void mm_fault_error(struct pt_regs *regs,
+               unsigned long error_code, unsigned long address, unsigned int fault)
+{
+       if (fault & VM_FAULT_OOM)
+               out_of_memory(regs, error_code, address);
+       else if (fault & VM_FAULT_SIGBUS)
+               do_sigbus(regs, error_code, address);
+       else
+               BUG();
+}
+
  static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  {
         if ((error_code & PF_WRITE) && !pte_write(*pte))
@@ -448,8 +632,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
   * There are no security implications to leaving a stale TLB when
   * increasing the permissions on a page.
   */
-static int spurious_fault(unsigned long address,
-                         unsigned long error_code)
+static noinline int spurious_fault(unsigned long error_code,
+                               unsigned long address)
  {
         pgd_t *pgd;
         pud_t *pud;
@@ -494,7 +678,7 @@ static int spurious_fault(unsigned long address,
   *
   * This assumes no large pages in there.
   */
-static int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
  {
  #ifdef CONFIG_X86_32
         unsigned long pgd_paddr;
@@ -573,6 +757,25 @@ static int vmalloc_fault(unsigned long address)
  
  int show_unhandled_signals = 1;
  
+static inline int access_error(unsigned long error_code, int write,
+                               struct vm_area_struct *vma)
+{
+       if (write) {
+               /* write, present and write, not present */
+               if (unlikely(!(vma->vm_flags & VM_WRITE)))
+                       return 1;
+       } else if (unlikely(error_code & PF_PROT)) {
+               /* read, present */
+               return 1;
+       } else {
+               /* read, not present */
+               if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+                       return 1;
+       }
+
+       return 0;
+}
+
  /*
   * This routine handles page faults.  It determines the address,
   * and the problem, and then passes it off to one of the appropriate
@@ -583,16 +786,12 @@ asmlinkage
  #endif
  void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
  {
+       unsigned long address;
         struct task_struct *tsk;
         struct mm_struct *mm;
         struct vm_area_struct *vma;
-       unsigned long address;
-       int write, si_code;
+       int write;
         int fault;
-#ifdef CONFIG_X86_64
-       unsigned long flags;
-       int sig;
-#endif
  
         tsk = current;
         mm = tsk->mm;
@@ -601,9 +800,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
         /* get the address */
         address = read_cr2();
  
-       si_code = SEGV_MAPERR;
-
-       if (notify_page_fault(regs))
+       if (unlikely(notify_page_fault(regs)))
                 return;
         if (unlikely(kmmio_fault(regs, address)))
                 return;
@@ -631,17 +828,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
                         return;
  
                 /* Can handle a stale RO->RW TLB */
-               if (spurious_fault(address, error_code))
+               if (spurious_fault(error_code, address))
                         return;
  
                 /*
                  * Don't take the mm semaphore here. If we fixup a prefetch
                  * fault we could otherwise deadlock.
                  */
-               goto bad_area_nosemaphore;
+               bad_area_nosemaphore(regs, error_code, address);
+               return;
         }
  
-
         /*
          * It's safe to allow irq's after cr2 has been saved and the
          * vmalloc fault has been handled.
@@ -657,15 +854,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
  
  #ifdef CONFIG_X86_64
         if (unlikely(error_code & PF_RSVD))
-               pgtable_bad(address, regs, error_code);
+               pgtable_bad(regs, error_code, address);
  #endif
  
         /*
          * If we're in an interrupt, have no user context or are running in an
          * atomic region then we must not take the fault.
          */
-       if (unlikely(in_atomic() || !mm))
-               goto bad_area_nosemaphore;
+       if (unlikely(in_atomic() || !mm)) {
+               bad_area_nosemaphore(regs, error_code, address);
+               return;
+       }
  
         /*
          * When running in the kernel we expect faults to occur only to
@@ -683,20 +882,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
          * source.  If this is invalid we can skip the address space check,
          * thus avoiding the deadlock.
          */
-       if (!down_read_trylock(&mm->mmap_sem)) {
+       if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                 if ((error_code & PF_USER) == 0 &&
-                   !search_exception_tables(regs->ip))
-                       goto bad_area_nosemaphore;
+                   !search_exception_tables(regs->ip)) {
+                       bad_area_nosemaphore(regs, error_code, address);
+                       return;
+               }
                 down_read(&mm->mmap_sem);
         }
  
         vma = find_vma(mm, address);
-       if (!vma)
-               goto bad_area;
-       if (vma->vm_start <= address)
+       if (unlikely(!vma)) {
+               bad_area(regs, error_code, address);
+               return;
+       }
+       if (likely(vma->vm_start <= address))
                 goto good_area;
-       if (!(vma->vm_flags & VM_GROWSDOWN))
-               goto bad_area;
+       if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+               bad_area(regs, error_code, address);
+               return;
+       }
         if (error_code & PF_USER) {
                 /*
                  * Accessing the stack below %sp is always a bug.
@@ -704,31 +909,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
                  * and pusha to work.  ("enter $65535,$31" pushes
                  * 32 pointers and then decrements %sp by 65535.)
                  */
-               if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
-                       goto bad_area;
+               if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
+                       bad_area(regs, error_code, address);
+                       return;
+               }
         }
-       if (expand_stack(vma, address))
-               goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
+       if (unlikely(expand_stack(vma, address))) {
+               bad_area(regs, error_code, address);
+               return;
+       }
+
+       /*
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
  good_area:
-       si_code = SEGV_ACCERR;
-       write = 0;
-       switch (error_code & (PF_PROT|PF_WRITE)) {
-       default:        /* 3: write, present */
-               /* fall through */
-       case PF_WRITE:          /* write, not present */
-               if (!(vma->vm_flags & VM_WRITE))
-                       goto bad_area;
-               write++;
-               break;
-       case PF_PROT:           /* read, present */
-               goto bad_area;
-       case 0:                 /* read, not present */
-               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-                       goto bad_area;
+       write = error_code & PF_WRITE;
+       if (unlikely(access_error(error_code, write, vma))) {
+               bad_area_access_error(regs, error_code, address);
+               return;
         }
  
         /*
@@ -738,11 +937,8 @@ good_area:
          */
         fault = handle_mm_fault(mm, vma, address, write);
         if (unlikely(fault & VM_FAULT_ERROR)) {
-               if (fault & VM_FAULT_OOM)
-                       goto out_of_memory;
-               else if (fault & VM_FAULT_SIGBUS)
-                       goto do_sigbus;
-               BUG();
+               mm_fault_error(regs, error_code, address, fault);
+               return;
         }
         if (fault & VM_FAULT_MAJOR)
                 tsk->maj_flt++;
@@ -760,128 +956,6 @@ good_area:
         }
  #endif
         up_read(&mm->mmap_sem);
-       return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-       up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-       /* User mode accesses just cause a SIGSEGV */
-       if (error_code & PF_USER) {
-               /*
-                * It's possible to have interrupts off here.
-                */
-               local_irq_enable();
-
-               /*
-                * Valid to do another page fault here because this one came
-                * from user space.
-                */
-               if (is_prefetch(regs, address, error_code))
-                       return;
-
-               if (is_errata100(regs, address))
-                       return;
-
-               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-                   printk_ratelimit()) {
-                       printk(
-                       "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
-                       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-                       tsk->comm, task_pid_nr(tsk), address,
-                       (void *) regs->ip, (void *) regs->sp, error_code);
-                       print_vma_addr(" in ", regs->ip);
-                       printk("\n");
-               }
-
-               tsk->thread.cr2 = address;
-               /* Kernel addresses are always protection faults */
-               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
-               tsk->thread.trap_no = 14;
-               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
-               return;
-       }
-
-       if (is_f00f_bug(regs, address))
-               return;
-
-no_context:
-       /* Are we prepared to handle this kernel fault?  */
-       if (fixup_exception(regs))
-               return;
-
-       /*
-        * X86_32
-        * Valid to do another page fault here, because if this fault
-        * had been triggered by is_prefetch fixup_exception would have
-        * handled it.
-        *
-        * X86_64
-        * Hall of shame of CPU/BIOS bugs.
-        */
-       if (is_prefetch(regs, address, error_code))
-               return;
-
-       if (is_errata93(regs, address))
-               return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-#ifdef CONFIG_X86_32
-       bust_spinlocks(1);
-#else
-       flags = oops_begin();
-#endif
-
-       show_fault_oops(regs, error_code, address);
-
-       tsk->thread.cr2 = address;
-       tsk->thread.trap_no = 14;
-       tsk->thread.error_code = error_code;
-
-#ifdef CONFIG_X86_32
-       die("Oops", regs, error_code);
-       bust_spinlocks(0);
-       do_exit(SIGKILL);
-#else
-       sig = SIGKILL;
-       if (__die("Oops", regs, error_code))
-               sig = 0;
-       /* Executive summary in case the body of the oops scrolled away */
-       printk(KERN_EMERG "CR2: %016lx\n", address);
-       oops_end(flags, regs, sig);
-#endif
-
-out_of_memory:
-       /*
-        * We ran out of memory, call the OOM killer, and return the userspace
-        * (which will retry the fault, or kill us if we got oom-killed).
-        */
-       up_read(&mm->mmap_sem);
-       pagefault_out_of_memory();
-       return;
-
-do_sigbus:
-       up_read(&mm->mmap_sem);
-
-       /* Kernel mode? Handle exceptions or die */
-       if (!(error_code & PF_USER))
-               goto no_context;
-#ifdef CONFIG_X86_32
-       /* User space => ok to do another page fault */
-       if (is_prefetch(regs, address, error_code))
-               return;
-#endif
-       tsk->thread.cr2 = address;
-       tsk->thread.error_code = error_code;
-       tsk->thread.trap_no = 14;
-       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
  }
  
  DEFINE_SPINLOCK(pgd_lock);
author	Nick Piggin <npiggin@suse.de>
	Tue, 20 Jan 2009 03:24:26 +0000 (04:24 +0100)
committer	Ingo Molnar <mingo@elte.hu>
	Tue, 20 Jan 2009 12:14:23 +0000 (13:14 +0100)