arch/x86/kernel/traps.c

   1 /*
   2  *  Copyright (C) 1991, 1992  Linus Torvalds
   3  *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
   4  *
   5  *  Pentium III FXSR, SSE support
   6  *      Gareth Hughes <gareth@valinux.com>, May 2000
   7  */
   8
   9 /*
  10  * Handle hardware traps and faults.
  11  */
  12
  13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  14
  15 #include <linux/context_tracking.h>
  16 #include <linux/interrupt.h>
  17 #include <linux/kallsyms.h>
  18 #include <linux/spinlock.h>
  19 #include <linux/kprobes.h>
  20 #include <linux/uaccess.h>
  21 #include <linux/kdebug.h>
  22 #include <linux/kgdb.h>
  23 #include <linux/kernel.h>
  24 #include <linux/export.h>
  25 #include <linux/ptrace.h>
  26 #include <linux/uprobes.h>
  27 #include <linux/string.h>
  28 #include <linux/delay.h>
  29 #include <linux/errno.h>
  30 #include <linux/kexec.h>
  31 #include <linux/sched.h>
  32 #include <linux/sched/task_stack.h>
  33 #include <linux/timer.h>
  34 #include <linux/init.h>
  35 #include <linux/bug.h>
  36 #include <linux/nmi.h>
  37 #include <linux/mm.h>
  38 #include <linux/smp.h>
  39 #include <linux/io.h>
  40
  41 #ifdef CONFIG_EISA
  42 #include <linux/ioport.h>
  43 #include <linux/eisa.h>
  44 #endif
  45
  46 #if defined(CONFIG_EDAC)
  47 #include <linux/edac.h>
  48 #endif
  49
  50 #include <asm/kmemcheck.h>
  51 #include <asm/stacktrace.h>
  52 #include <asm/processor.h>
  53 #include <asm/debugreg.h>
  54 #include <linux/atomic.h>
  55 #include <asm/text-patching.h>
  56 #include <asm/ftrace.h>
  57 #include <asm/traps.h>
  58 #include <asm/desc.h>
  59 #include <asm/fpu/internal.h>
  60 #include <asm/mce.h>
  61 #include <asm/fixmap.h>
  62 #include <asm/mach_traps.h>
  63 #include <asm/alternative.h>
  64 #include <asm/fpu/xstate.h>
  65 #include <asm/trace/mpx.h>
  66 #include <asm/mpx.h>
  67 #include <asm/vm86.h>
  68
  69 #ifdef CONFIG_X86_64
  70 #include <asm/x86_init.h>
  71 #include <asm/pgalloc.h>
  72 #include <asm/proto.h>
  73
  74 /*
      * Second gate-descriptor table with NR_VECTORS entries; it is only
      * defined for CONFIG_X86_64 builds.
      * No need to be aligned, but done to keep all IDTs defined the same way.
      */
  75 gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss;
  76 #else
  77 #include <asm/processor-flags.h>
  78 #include <asm/setup.h>
  79 #include <asm/proto.h>
  80 #endif
  81
  82 /* Must be page-aligned because the real IDT is used in a fixmap. */
  83 gate_desc idt_table[NR_VECTORS] __page_aligned_bss;
  84
     /*
      * Bitmap with one bit per vector (NR_VECTORS bits), exported to GPL
      * modules.  NOTE(review): presumably marks vectors that are already
      * claimed; nothing in this part of the file reads or writes it, so
      * confirm against its users.
      */
  85 DECLARE_BITMAP(used_vectors, NR_VECTORS);
  86 EXPORT_SYMBOL_GPL(used_vectors);
  87
     /*
      * Re-enable interrupts inside a trap handler, but only when the saved
      * flags in @regs have X86_EFLAGS_IF set.  If the bit is clear the
      * interrupt state is left untouched.
      */
  88 static inline void cond_local_irq_enable(struct pt_regs *regs)
  89 {
  90         if (regs->flags & X86_EFLAGS_IF)
  91                 local_irq_enable();
  92 }
  93
     /*
      * Counterpart of cond_local_irq_enable(): turn interrupts off again
      * under the same X86_EFLAGS_IF test on the saved flags in @regs.
      */
  94 static inline void cond_local_irq_disable(struct pt_regs *regs)
  95 {
  96         if (regs->flags & X86_EFLAGS_IF)
  97                 local_irq_disable();
  98 }
  99
 100 /*
 101  * In IST context, we explicitly disable preemption.  This serves two
 102  * purposes: it makes it much less likely that we would accidentally
 103  * schedule in IST context and it will force a warning if we somehow
 104  * manage to schedule by accident.
      *
      * Besides disabling preemption, entries from kernel mode notify RCU
      * through rcu_nmi_enter(); entries from user mode only assert that RCU
      * is already watching.  ist_exit() undoes both steps.
 105  */
 106 void ist_enter(struct pt_regs *regs)
 107 {
 108         if (user_mode(regs)) {
 109                 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 110         } else {
 111                 /*
 112                  * We might have interrupted pretty much anything.  In
 113                  * fact, if we're a machine check, we can even interrupt
 114                  * NMI processing.  We don't want in_nmi() to return true,
 115                  * but we need to notify RCU.
 116                  */
 117                 rcu_nmi_enter();
 118         }
 119
 120         preempt_disable();
 121
 122         /* This code is a bit fragile.  Test it. */
 123         RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
 124 }
 125
     /*
      * Counterpart of ist_enter(): re-enables preemption via
      * preempt_enable_no_resched(), then calls rcu_nmi_exit() when the
      * exception came from kernel mode, which is the only case in which
      * ist_enter() called rcu_nmi_enter().
      */
 126 void ist_exit(struct pt_regs *regs)
 127 {
 128         preempt_enable_no_resched();
 129
 130         if (!user_mode(regs))
 131                 rcu_nmi_exit();
 132 }
 133
 134 /**
 135  * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
 136  * @regs:       regs passed to the IST exception handler
 137  *
 138  * IST exception handlers normally cannot schedule.  As a special
 139  * exception, if the exception interrupted userspace code (i.e.
 140  * user_mode(regs) would return true) and the exception was not
 141  * a double fault, it can be safe to schedule.  ist_begin_non_atomic()
 142  * begins a non-atomic section within an ist_enter()/ist_exit() region.
 143  * Callers are responsible for enabling interrupts themselves inside
 144  * the non-atomic section, and callers must call ist_end_non_atomic()
 145  * before ist_exit().
      *
      * Both preconditions are enforced with BUG_ON(): the exception must have
      * come from user mode and the handler must be on the thread stack.
 146  */
 147 void ist_begin_non_atomic(struct pt_regs *regs)
 148 {
 149         BUG_ON(!user_mode(regs));
 150
 151         /*
 152          * Sanity check: we need to be on the normal thread stack.  This
 153          * will catch asm bugs and any attempt to use ist_begin_non_atomic()
 154          * from double_fault.
 155          */
 156         BUG_ON(!on_thread_stack());
 157
             /* Balances the preempt_disable() done in ist_enter(). */
 158         preempt_enable_no_resched();
 159 }
 160
 161 /**
 162  * ist_end_non_atomic() - end a non-atomic section in an IST exception
 163  *
 164  * Ends a non-atomic section started with ist_begin_non_atomic().
      * Disables preemption again, balancing the preempt_enable_no_resched()
      * done by ist_begin_non_atomic().
 165  */
 166 void ist_end_non_atomic(void)
 167 {
 168         preempt_disable();
 169 }
 170
     /*
      * Report whether @addr may hold a bug-trap instruction.
      *
      * Returns non-zero only if @addr is not below TASK_SIZE_MAX, an
      * unsigned short can be read from it with probe_kernel_address(), and
      * the value read equals INSN_UD0 or INSN_UD2.  Returns 0 otherwise,
      * including when the probe read fails.
      */
 171 int is_valid_bugaddr(unsigned long addr)
 172 {
 173         unsigned short ud;
 174
 175         if (addr < TASK_SIZE_MAX)
 176                 return 0;
 177
 178         if (probe_kernel_address((unsigned short *)addr, ud))
 179                 return 0;
 180
 181         return ud == INSN_UD0 || ud == INSN_UD2;
 182 }
 183
     /*
      * Try to recover from a trap that was raised by a WARN-type bug entry.
      *
      * Only X86_TRAP_UD is considered.  report_bug() classifies the
      * instruction at regs->ip; for BUG_TRAP_TYPE_WARN the instruction
      * pointer is advanced past the trapping opcode and 1 is returned so the
      * caller can resume.  Every other case returns 0 and leaves @regs
      * untouched.
      */
 184 int fixup_bug(struct pt_regs *regs, int trapnr)
 185 {
 186         if (trapnr != X86_TRAP_UD)
 187                 return 0;
 188
 189         switch (report_bug(regs->ip, regs)) {
 190         case BUG_TRAP_TYPE_NONE:
 191         case BUG_TRAP_TYPE_BUG:
 192                 break;
 193
 194         case BUG_TRAP_TYPE_WARN:
                     /*
                      * NOTE(review): the skip distance is always LEN_UD0, while
                      * is_valid_bugaddr() accepts both INSN_UD0 and INSN_UD2;
                      * this assumes both encodings have that length - confirm.
                      */
 195                 regs->ip += LEN_UD0;
 196                 return 1;
 197         }
 198
 199         return 0;
 200 }
 201
     /*
      * Handle the cases of a trap that can be resolved without sending a
      * signal to @tsk.
      *
      * Returns 0 when the trap has been dealt with here (forwarded to vm86 or
      * fixed up by an exception table entry) and the caller must not signal.
      * Returns -1 when the caller should go on to deliver a signal.
      */
 202 static nokprobe_inline int
 203 do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 204                   struct pt_regs *regs, long error_code)
 205 {
 206         if (v8086_mode(regs)) {
 207                 /*
 208                  * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
 209                  * On nmi (interrupt 2), do_trap should not be called.
 210                  */
 211                 if (trapnr < X86_TRAP_UD) {
 212                         if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
 213                                                 error_code, trapnr))
 214                                 return 0;
 215                 }
 216                 return -1;
 217         }
 218
 219         if (!user_mode(regs)) {
 220                 if (fixup_exception(regs, trapnr))
 221                         return 0;
 222
                     /*
                      * Unfixable kernel-mode trap: record the fault details on
                      * the task before die().  If die() returns, control falls
                      * through to the "return -1" below.
                      */
 223                 tsk->thread.error_code = error_code;
 224                 tsk->thread.trap_nr = trapnr;
 225                 die(str, regs, error_code);
 226         }
 227
 228         return -1;
 229 }
 230
     /*
      * Build the siginfo for the traps that report a si_code and si_addr.
      *
      * For X86_TRAP_DE, X86_TRAP_UD and X86_TRAP_AC the caller-provided @info
      * is filled in (si_signo = @signr, si_errno = 0, trap-specific si_code
      * and si_addr) and returned.  For any other @trapnr, @info is left
      * untouched and SEND_SIG_PRIV is returned instead.
      */
 231 static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr,
 232                                 siginfo_t *info)
 233 {
 234         unsigned long siaddr;
 235         int sicode;
 236
 237         switch (trapnr) {
             /* Listed first: traps without a dedicated si_code/si_addr. */
 238         default:
 239                 return SEND_SIG_PRIV;
 240
 241         case X86_TRAP_DE:
 242                 sicode = FPE_INTDIV;
 243                 siaddr = uprobe_get_trap_addr(regs);
 244                 break;
 245         case X86_TRAP_UD:
 246                 sicode = ILL_ILLOPN;
 247                 siaddr = uprobe_get_trap_addr(regs);
 248                 break;
 249         case X86_TRAP_AC:
                     /* Alignment checks report no fault address (si_addr = 0). */
 250                 sicode = BUS_ADRALN;
 251                 siaddr = 0;
 252                 break;
 253         }
 254
 255         info->si_signo = signr;
 256         info->si_errno = 0;
 257         info->si_code = sicode;
 258         info->si_addr = (void __user *)siaddr;
 259         return info;
 260 }
 261
     /*
      * Common tail of the simple trap handlers.
      *
      * @trapnr:     trap number, stored in tsk->thread.trap_nr
      * @signr:      signal to force on the current task
      * @str:        short trap description used for die() and the log line
      * @regs:       register state at the time of the trap
      * @error_code: error code, stored in tsk->thread.error_code
      * @info:       siginfo to deliver; NULL selects SEND_SIG_PRIV
      *
      * Returns without signalling when do_trap_no_signal() has already dealt
      * with the trap.  Otherwise records the fault on the task, optionally
      * logs it (rate-limited, only for unhandled signals) and forces @signr.
      */
 262 static void
 263 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 264         long error_code, siginfo_t *info)
 265 {
 266         struct task_struct *tsk = current;
 267
 268
 269         if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
 270                 return;
 271         /*
 272          * We want error_code and trap_nr set for userspace faults and
 273          * kernelspace faults which result in die(), but not
 274          * kernelspace faults which are fixed up.  die() gives the
 275          * process no chance to handle the signal and notice the
 276          * kernel fault information, so that won't result in polluting
 277          * the information about previously queued, but not yet
 278          * delivered, faults.  See also do_general_protection below.
 279          */
 280         tsk->thread.error_code = error_code;
 281         tsk->thread.trap_nr = trapnr;
 282
 283         if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
 284             printk_ratelimit()) {
                     /*
                      * Use the task_pid_nr() accessor rather than the raw pid
                      * field, matching the equivalent log line in
                      * do_general_protection().
                      */
 285                 pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
 286                         tsk->comm, task_pid_nr(tsk), str,
 287                         regs->ip, regs->sp, error_code);
 288                 print_vma_addr(KERN_CONT " in ", regs->ip);
 289                 pr_cont("\n");
 290         }
 291
 292         force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
 293 }
 294 NOKPROBE_SYMBOL(do_trap);
 295
     /*
      * Shared body of the handlers generated by DO_ERROR().
      *
      * Order of operations:
      *  1. For kernel-mode traps, give fixup_bug() a chance to recover from a
      *     WARN; if it succeeds, return immediately.
      *  2. Run the DIE_TRAP notifier chain; a NOTIFY_STOP result ends handling.
      *  3. Otherwise conditionally re-enable interrupts and hand over to
      *     do_trap() with the siginfo chosen by fill_trap_info().
      */
 296 static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 297                           unsigned long trapnr, int signr)
 298 {
 299         siginfo_t info;
 300
 301         RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 302
 303         /*
 304          * WARN*()s end up here; fix them up before we call the
 305          * notifier chain.
 306          */
 307         if (!user_mode(regs) && fixup_bug(regs, trapnr))
 308                 return;
 309
 310         if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 311                         NOTIFY_STOP) {
 312                 cond_local_irq_enable(regs);
 313                 do_trap(trapnr, signr, str, regs, error_code,
 314                         fill_trap_info(regs, signr, trapnr, &info));
 315         }
 316 }
 317
     /*
      * DO_ERROR(trapnr, signr, str, name) defines the trap entry point
      * do_<name>(regs, error_code), which simply forwards to do_error_trap()
      * with the given trap number, signal and description string.
      *
      * Of the traps instantiated below, only X86_TRAP_DE, X86_TRAP_UD and
      * X86_TRAP_AC get a filled-in siginfo from fill_trap_info(); the others
      * are delivered with SEND_SIG_PRIV.
      */
 318 #define DO_ERROR(trapnr, signr, str, name)                              \
 319 dotraplinkage void do_##name(struct pt_regs *regs, long error_code)     \
 320 {                                                                       \
 321         do_error_trap(regs, error_code, str, trapnr, signr);            \
 322 }
 323
 324 DO_ERROR(X86_TRAP_DE,     SIGFPE,  "divide error",              divide_error)
 325 DO_ERROR(X86_TRAP_OF,     SIGSEGV, "overflow",                  overflow)
 326 DO_ERROR(X86_TRAP_UD,     SIGILL,  "invalid opcode",            invalid_op)
 327 DO_ERROR(X86_TRAP_OLD_MF, SIGFPE,  "coprocessor segment overrun",coprocessor_segment_overrun)
 328 DO_ERROR(X86_TRAP_TS,     SIGSEGV, "invalid TSS",               invalid_TSS)
 329 DO_ERROR(X86_TRAP_NP,     SIGBUS,  "segment not present",       segment_not_present)
 330 DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",             stack_segment)
 331 DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",           alignment_check)
 332
 333 #ifdef CONFIG_VMAP_STACK
     /*
      * handle_stack_overflow() - report a hit on the kernel stack guard page
      * @message:       text handed to die() and, as a last resort, to panic()
      * @regs:          register state at the time of the fault
      * @fault_address: faulting address, printed next to the stack bounds
      *
      * Logs the fault address together with the first and last byte address
      * of the current task's stack, then calls die().  Never returns: if
      * die() comes back, panic() is called.
      */
 334 __visible void __noreturn handle_stack_overflow(const char *message,
 335                                                 struct pt_regs *regs,
 336                                                 unsigned long fault_address)
 337 {
 338         printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
 339                  (void *)fault_address, current->stack,
 340                  (char *)current->stack + THREAD_SIZE - 1);
 341         die(message, regs, 0);
 342
 343         /*
              * Be absolutely certain we don't return.  @message is passed as
              * an argument and never as the format string, so a '%' inside it
              * cannot be interpreted as a conversion specifier.
              */
 344         panic("%s", message);
 345 }
 346 #endif
 347
 348 #ifdef CONFIG_X86_64
 349 /*
      * Double-fault handler.  Runs on IST stack.
      *
      * Possible outcomes:
      *  - CONFIG_X86_ESPFIX64: a fault raised by the IRET at
      *    native_irq_return_iret while on the espfix64 stack is rewritten to
      *    look like a #GP(0) from user space, and the handler returns.
      *  - CONFIG_VMAP_STACK: if CR2 points into the page just below the task
      *    stack, handle_stack_overflow() is called (it does not return).
      *  - Otherwise die() is called in an endless loop.
      */
 350 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 351 {
 352         static const char str[] = "double fault";
 353         struct task_struct *tsk = current;
 354 #ifdef CONFIG_VMAP_STACK
 355         unsigned long cr2;
 356 #endif
 357
 358 #ifdef CONFIG_X86_ESPFIX64
 359         extern unsigned char native_irq_return_iret[];
 360
 361         /*
 362          * If IRET takes a non-IST fault on the espfix64 stack, then we
 363          * end up promoting it to a doublefault.  In that case, modify
 364          * the stack to make it look like we just entered the #GP
 365          * handler from user space, similar to bad_iret.
 366          *
 367          * No need for ist_enter here because we don't use RCU.
 368          */
 369         if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
 370                 regs->cs == __KERNEL_CS &&
 371                 regs->ip == (unsigned long)native_irq_return_iret)
 372         {
 373                 struct pt_regs *normal_regs = task_pt_regs(current);
 374
 375                 /* Fake a #GP(0) from userspace. */
                     /*
                      * Copy five 8-byte words from the interrupted stack
                      * pointer into normal_regs, starting at its ip field.
                      */
 376                 memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
 377                 normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */
                     /*
                      * Redirect the return from this handler: resume at
                      * general_protection with the stack pointer at the
                      * orig_ax slot of the frame built above.
                      */
 378                 regs->ip = (unsigned long)general_protection;
 379                 regs->sp = (unsigned long)&normal_regs->orig_ax;
 380
 381                 return;
 382         }
 383 #endif
 384
 385         ist_enter(regs);
 386         notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 387
 388         tsk->thread.error_code = error_code;
 389         tsk->thread.trap_nr = X86_TRAP_DF;
 390
 391 #ifdef CONFIG_VMAP_STACK
 392         /*
 393          * If we overflow the stack into a guard page, the CPU will fail
 394          * to deliver #PF and will send #DF instead.  Similarly, if we
 395          * take any non-IST exception while too close to the bottom of
 396          * the stack, the processor will get a page fault while
 397          * delivering the exception and will generate a double fault.
 398          *
 399          * According to the SDM (footnote in 6.15 under "Interrupt 14 -
 400          * Page-Fault Exception (#PF)"):
 401          *
 402          *   Processors update CR2 whenever a page fault is detected. If a
 403          *   second page fault occurs while an earlier page fault is being
 404          *   delivered, the faulting linear address of the second fault will
 405          *   overwrite the contents of CR2 (replacing the previous
 406          *   address). These updates to CR2 occur even if the page fault
 407          *   results in a double fault or occurs during the delivery of a
 408          *   double fault.
 409          *
 410          * The logic below has a small possibility of incorrectly diagnosing
 411          * some errors as stack overflows.  For example, if the IDT or GDT
 412          * gets corrupted such that #GP delivery fails due to a bad descriptor
 413          * causing #GP and we hit this condition while CR2 coincidentally
 414          * points to the stack guard page, we'll think we overflowed the
 415          * stack.  Given that we're going to panic one way or another
 416          * if this happens, this isn't necessarily worth fixing.
 417          *
 418          * If necessary, we could improve the test by only diagnosing
 419          * a stack overflow if the saved RSP points within 47 bytes of
 420          * the bottom of the stack: if RSP == tsk_stack + 48 and we
 421          * take an exception, the stack is already aligned and there
 422          * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
 423          * possible error code, so a stack overflow would *not* double
 424          * fault.  With any less space left, exception delivery could
 425          * fail, and, as a practical matter, we've overflowed the
 426          * stack even if the actual trigger for the double fault was
 427          * something else.
 428          */
 429         cr2 = read_cr2();
             /*
              * Unsigned arithmetic: the difference is below PAGE_SIZE only when
              * cr2 lies in the PAGE_SIZE bytes immediately below the address
              * returned by task_stack_page(); any other cr2 wraps to a large
              * value.
              */
 430         if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
 431                 handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
 432 #endif
 433
 434 #ifdef CONFIG_DOUBLEFAULT
 435         df_debug(regs, error_code);
 436 #endif
 437         /*
 438          * This is always a kernel trap and never fixable (and thus must
 439          * never return).
 440          */
 441         for (;;)
 442                 die(str, regs, error_code);
 443 }
 444 #endif
 445
     /*
      * Handler for the bounds trap (X86_TRAP_BR).
      *
      * After the DIE_TRAP notifier chain, kernel-mode traps go to die().
      * For user mode, when MPX is enabled and a BNDCSR state is available,
      * the error code in BNDSTATUS selects the action:
      *   2 - bound directory fault, handled by mpx_handle_bd_fault()
      *   1 - bound violation, SIGSEGV with the siginfo built by
      *       mpx_generate_siginfo()
      *   0 - not caused by MPX
      * Everything that could not be handled ends at exit_trap, which sends a
      * plain SIGSEGV through do_trap().
      */
 446 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 447 {
 448         const struct mpx_bndcsr *bndcsr;
 449         siginfo_t *info;
 450
 451         RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 452         if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 453                         X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
 454                 return;
 455         cond_local_irq_enable(regs);
 456
             /*
              * NOTE(review): there is no return after die(); if die() comes
              * back, execution continues with the user-mode logic below.
              */
 457         if (!user_mode(regs))
 458                 die("bounds", regs, error_code);
 459
 460         if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
 461                 /* The exception is not from Intel MPX */
 462                 goto exit_trap;
 463         }
 464
 465         /*
 466          * We need to look at BNDSTATUS to resolve this exception.
 467          * A NULL here might mean that it is in its 'init state',
 468          * which is all zeros which indicates MPX was not
 469          * responsible for the exception.
 470          */
 471         bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
 472         if (!bndcsr)
 473                 goto exit_trap;
 474
 475         trace_bounds_exception_mpx(bndcsr);
 476         /*
 477          * The error code field of the BNDSTATUS register communicates status
 478          * information of a bound range exception #BR or operation involving
 479          * bound directory.
 480          */
 481         switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
 482         case 2: /* Bound directory has invalid entry. */
 483                 if (mpx_handle_bd_fault())
 484                         goto exit_trap;
 485                 break; /* Success, it was handled */
 486         case 1: /* Bound violation. */
 487                 info = mpx_generate_siginfo(regs);
 488                 if (IS_ERR(info)) {
 489                         /*
 490                          * We failed to decode the MPX instruction.  Act as if
 491                          * the exception was not caused by MPX.
 492                          */
 493                         goto exit_trap;
 494                 }
 495                 /*
 496                  * Success, we decoded the instruction and retrieved
 497                  * an 'info' containing the address being accessed
 498                  * which caused the exception.  This information
 499                  * allows an application to possibly handle the
 500                  * #BR exception itself.
 501                  */
 502                 do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info);
                     /* The siginfo from mpx_generate_siginfo() is released here. */
 503                 kfree(info);
 504                 break;
 505         case 0: /* No exception caused by Intel MPX operations. */
 506                 goto exit_trap;
 507         default:
 508                 die("bounds", regs, error_code);
 509         }
 510
 511         return;
 512
 513 exit_trap:
 514         /*
 515          * This path out is for all the cases where we could not
 516          * handle the exception in some way (like allocating a
 517          * table or telling userspace about it).  We will also end
 518          * up here if the kernel has MPX turned off at compile
 519          * time.
 520          */
 521         do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
 522 }
 523
     /*
      * General protection fault handler.
      *
      * Three cases, in order:
      *  - vm86 mode: the fault is passed to handle_vm86_fault().
      *  - kernel mode: try an exception-table fixup; failing that, record the
      *    fault on the task, run the DIE_GPF notifiers and die() unless a
      *    notifier returned NOTIFY_STOP.
      *  - user mode: record the fault on the task, optionally log it
      *    (rate-limited, only for unhandled signals) and force SIGSEGV.
      */
 524 dotraplinkage void
 525 do_general_protection(struct pt_regs *regs, long error_code)
 526 {
 527         struct task_struct *tsk;
 528
 529         RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 530         cond_local_irq_enable(regs);
 531
 532         if (v8086_mode(regs)) {
                     /* Interrupts are enabled unconditionally on this path. */
 533                 local_irq_enable();
 534                 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
 535                 return;
 536         }
 537
 538         tsk = current;
 539         if (!user_mode(regs)) {
 540                 if (fixup_exception(regs, X86_TRAP_GP))
 541                         return;
 542
 543                 tsk->thread.error_code = error_code;
 544                 tsk->thread.trap_nr = X86_TRAP_GP;
 545                 if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
 546                                X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
 547                         die("general protection fault", regs, error_code);
 548                 return;
 549         }
 550
 551         tsk->thread.error_code = error_code;
 552         tsk->thread.trap_nr = X86_TRAP_GP;
 553
 554         if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
 555                         printk_ratelimit()) {
 556                 pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
 557                         tsk->comm, task_pid_nr(tsk),
 558                         regs->ip, regs->sp, error_code);
 559                 print_vma_addr(KERN_CONT " in ", regs->ip);
 560                 pr_cont("\n");
 561         }
 562
 563         force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
 564 }
 565 NOKPROBE_SYMBOL(do_general_protection);
 566
 567 /*
      * Breakpoint (int3) handler.  May run on IST stack.
      *
      * Consumers are tried in a fixed order and the first one that claims the
      * trap ends handling: ftrace (only while it is modifying code), then
      * poke_int3_handler() - both before ist_enter() - followed by the kgdb
      * low-level hook, kprobes and the DIE_INT3 notifier chain.  If nobody
      * claims it, SIGTRAP is delivered through do_trap().
      */
 568 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 569 {
 570 #ifdef CONFIG_DYNAMIC_FTRACE
 571         /*
 572          * ftrace must be first, everything else may cause a recursive crash.
 573          * See note by declaration of modifying_ftrace_code in ftrace.c
 574          */
 575         if (unlikely(atomic_read(&modifying_ftrace_code)) &&
 576             ftrace_int3_handler(regs))
 577                 return;
 578 #endif
 579         if (poke_int3_handler(regs))
 580                 return;
 581
             /* From here on every exit path must go through ist_exit(). */
 582         ist_enter(regs);
 583         RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 584 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 585         if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 586                                 SIGTRAP) == NOTIFY_STOP)
 587                 goto exit;
 588 #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
 589
 590 #ifdef CONFIG_KPROBES
 591         if (kprobe_int3_handler(regs))
 592                 goto exit;
 593 #endif
 594
 595         if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 596                         SIGTRAP) == NOTIFY_STOP)
 597                 goto exit;
 598
 599         /*
 600          * Let others (NMI) know that the debug stack is in use
 601          * as we may switch to the interrupt stack.
 602          */
 603         debug_stack_usage_inc();
 604         cond_local_irq_enable(regs);
 605         do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
 606         cond_local_irq_disable(regs);
 607         debug_stack_usage_dec();
 608 exit:
 609         ist_exit(regs);
 610 }
 611 NOKPROBE_SYMBOL(do_int3);
 612
 613 #ifdef CONFIG_X86_64
 614 /*
 615  * Help handler running on IST stack to switch off the IST stack if the
 616  * interrupted code was in user mode. The actual stack switch is done in
 617  * entry_64.S
      *
      * Copies the whole register frame *eregs to task_pt_regs(current) and
      * returns a pointer to that copy.
 618  */
 619 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
 620 {
 621         struct pt_regs *regs = task_pt_regs(current);
 622         *regs = *eregs;
 623         return regs;
 624 }
 625 NOKPROBE_SYMBOL(sync_regs);
 626
     /*
      * Stack frame layout that fixup_bad_iret() receives and returns: one
      * pointer-sized slot followed by a full struct pt_regs.
      * NOTE(review): error_entry_ret is presumably the return address back
      * into the asm error-entry path - confirm against entry_64.S.
      */
 627 struct bad_iret_stack {
 628         void *error_entry_ret;
 629         struct pt_regs regs;
 630 };
 631
     /*
      * Relocate the frame of a fault caused by a bad IRET to user mode.
      *
      * @s is the frame on the current stack.  A new frame is built so that
      * its regs member coincides with task_pt_regs(current), and a pointer
      * to that new frame is returned.  BUG()s if the rebuilt register frame
      * does not describe user mode.
      */
 632 asmlinkage __visible notrace
 633 struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
 634 {
 635         /*
 636          * This is called from entry_64.S early in handling a fault
 637          * caused by a bad iret to user mode.  To handle the fault
 638          * correctly, we want move our stack frame to task_pt_regs
 639          * and we want to pretend that the exception came from the
 640          * iret target.
 641          */
 642         struct bad_iret_stack *new_stack =
 643                 container_of(task_pt_regs(current),
 644                              struct bad_iret_stack, regs);
 645
 646         /* Copy the IRET target to the new stack. */
             /* Five 8-byte words, read from the address held in s->regs.sp. */
 647         memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
 648
 649         /* Copy the remainder of the stack from the current stack. */
             /* That is everything in *s that precedes regs.ip. */
 650         memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
 651
 652         BUG_ON(!user_mode(&new_stack->regs));
 653         return new_stack;
 654 }
 655 NOKPROBE_SYMBOL(fixup_bad_iret);
 656 #endif
 657
     /*
      * Report whether regs->ip lies inside the SYSENTER entry region that can
      * be single-stepped.
      *
      * The region depends on the configuration: the
      * __begin/__end_SYSENTER_singlestep_region symbols on CONFIG_X86_32,
      * entry_SYSENTER_compat up to __end_entry_SYSENTER_compat with
      * CONFIG_IA32_EMULATION, and no region at all (always false) otherwise.
      */
 658 static bool is_sysenter_singlestep(struct pt_regs *regs)
 659 {
 660         /*
 661          * We don't try for precision here.  If we're anywhere in the region of
 662          * code that can be single-stepped in the SYSENTER entry path, then
 663          * assume that this is a useless single-step trap due to SYSENTER
 664          * being invoked with TF set.  (We don't know in advance exactly
 665          * which instructions will be hit because BTF could plausibly
 666          * be set.)
 667          */
             /*
              * Both range tests use a single unsigned comparison:
              * "ip - start < end - start" holds only for start <= ip < end,
              * because an ip below start wraps around to a large value.
              */
 668 #ifdef CONFIG_X86_32
 669         return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
 670                 (unsigned long)__end_SYSENTER_singlestep_region -
 671                 (unsigned long)__begin_SYSENTER_singlestep_region;
 672 #elif defined(CONFIG_IA32_EMULATION)
 673         return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
 674                 (unsigned long)__end_entry_SYSENTER_compat -
 675                 (unsigned long)entry_SYSENTER_compat;
 676 #else
 677         return false;
 678 #endif
 679 }
 680
 681 /*
 682  * Our handling of the processor debug registers is non-trivial.
 683  * We do not clear them on entry and exit from the kernel. Therefore
 684  * it is possible to get a watchpoint trap here from inside the kernel.
 685  * However, the code in ./ptrace.c has ensured that the user can
 686  * only set watchpoints on userspace addresses. Therefore the in-kernel
 687  * watchpoint trap can only occur in code which is reading/writing
 688  * from user space. Such code must not hold kernel locks (since it
 689  * can equally take a page fault), therefore it is safe to call
 690  * force_sig_info even though that claims and releases locks.
 691  *
 692  * Code in ./signal.c ensures that the debug control register
 693  * is restored before we deliver any signal, and therefore that
 694  * user code runs with the correct debug control register even though
 695  * we clear it here.
 696  *
 697  * Being careful here means that we don't have to be as careful in a
 698  * lot of more complicated places (task switching can be a bit lazy
 699  * about restoring all the debug state, and ptrace doesn't have to
 700  * find every occurrence of the TF bit that could be saved away even
 701  * by user code)
 702  *
 703  * May run on IST stack.
      *
      * Flow of the handler: read and clear DR6, drop the reserved bits,
      * filter out single-step traps taken inside the SYSENTER entry code,
      * store the remaining DR6 bits in tsk->thread.debugreg6, offer the trap
      * to kmemcheck, kprobes and the DIE_DEBUG notifiers, and finally either
      * forward it to vm86 or send SIGTRAP.  All paths after ist_enter() leave
      * through the exit label so that ist_exit() is always called.
 704  */
 705 dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 706 {
 707         struct task_struct *tsk = current;
 708         int user_icebp = 0;
 709         unsigned long dr6;
 710         int si_code;
 711
 712         ist_enter(regs);
 713
 714         get_debugreg(dr6, 6);
 715         /*
 716          * The Intel SDM says:
 717          *
 718          *   Certain debug exceptions may clear bits 0-3. The remaining
 719          *   contents of the DR6 register are never cleared by the
 720          *   processor. To avoid confusion in identifying debug
 721          *   exceptions, debug handlers should clear the register before
 722          *   returning to the interrupted task.
 723          *
 724          * Keep it simple: clear DR6 immediately.
 725          */
 726         set_debugreg(0, 6);
 727
 728         /* Filter out all the reserved bits which are preset to 1 */
 729         dr6 &= ~DR6_RESERVED;
 730
 731         /*
 732          * The SDM says "The processor clears the BTF flag when it
 733          * generates a debug exception."  Clear TIF_BLOCKSTEP to keep
 734          * TIF_BLOCKSTEP in sync with the hardware BTF flag.
 735          */
 736         clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
 737
 738         if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
 739                      is_sysenter_singlestep(regs))) {
 740                 dr6 &= ~DR_STEP;
 741                 if (!dr6)
 742                         goto exit;
 743                 /*
 744                  * else we might have gotten a single-step trap and hit a
 745                  * watchpoint at the same time, in which case we should fall
 746                  * through and handle the watchpoint.
 747                  */
 748         }
 749
 750         /*
 751          * If dr6 has no reason to give us about the origin of this trap,
 752          * then it's very likely the result of an icebp/int01 trap.
 753          * User wants a sigtrap for that.
 754          */
 755         if (!dr6 && user_mode(regs))
 756                 user_icebp = 1;
 757
 758         /* Catch kmemcheck conditions! */
 759         if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
 760                 goto exit;
 761
 762         /* Store the virtualized DR6 value */
 763         tsk->thread.debugreg6 = dr6;
 764
 765 #ifdef CONFIG_KPROBES
 766         if (kprobe_debug_handler(regs))
 767                 goto exit;
 768 #endif
 769
             /*
              * NOTE(review): unlike the other notify_die() calls in this file,
              * the address of dr6 is passed in the error-code position and
              * error_code in the trap-number position.  Presumably this lets
              * notifiers inspect dr6 - confirm against the DIE_DEBUG users.
              */
 770         if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
 771                                                         SIGTRAP) == NOTIFY_STOP)
 772                 goto exit;
 773
 774         /*
 775          * Let others (NMI) know that the debug stack is in use
 776          * as we may switch to the interrupt stack.
 777          */
 778         debug_stack_usage_inc();
 779
 780         /* It's safe to allow irq's after DR6 has been saved */
 781         cond_local_irq_enable(regs);
 782
 783         if (v8086_mode(regs)) {
 784                 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
 785                                         X86_TRAP_DB);
 786                 cond_local_irq_disable(regs);
 787                 debug_stack_usage_dec();
 788                 goto exit;
 789         }
 790
 791         if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
 792                 /*
 793                  * Historical junk that used to handle SYSENTER single-stepping.
 794                  * This should be unreachable now.  If we survive for a while
 795                  * without anyone hitting this warning, we'll turn this into
 796                  * an oops.
 797                  */
 798                 tsk->thread.debugreg6 &= ~DR_STEP;
 799                 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
 800                 regs->flags &= ~X86_EFLAGS_TF;
 801         }
 802         si_code = get_si_code(tsk->thread.debugreg6);
             /*
              * SIGTRAP is sent when a step or breakpoint bit survived the
              * filtering above, or for a user-mode trap with an empty dr6.
              */
 803         if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
 804                 send_sigtrap(tsk, regs, error_code, si_code);
 805         cond_local_irq_disable(regs);
 806         debug_stack_usage_dec();
 807
 808 exit:
 809         /*
 810          * This is the most likely code path that involves non-trivial use
 811          * of the SYSENTER stack.  Check that we haven't overrun it.
 812          */
 813         WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
 814              "Overran or corrupted SYSENTER stack\n");
 815
 816         ist_exit(regs);
 817 }
 818 NOKPROBE_SYMBOL(do_debug);
 819
 820 /*
 821  * Note that we play around with the 'TS' bit in an attempt to get
 822  * the correct behaviour even in the presence of the asynchronous
 823  * IRQ13 behaviour
 824  */
 825 static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 826 {
 827         struct task_struct *task = current;
 828         struct fpu *fpu = &task->thread.fpu;
 829         siginfo_t info;
 830         char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
 831                                                 "simd exception";
 832
 833         if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
 834                 return;
 835         cond_local_irq_enable(regs);
 836
 837         if (!user_mode(regs)) {
 838                 if (!fixup_exception(regs, trapnr)) {
 839                         task->thread.error_code = error_code;
 840                         task->thread.trap_nr = trapnr;
 841                         die(str, regs, error_code);
 842                 }
 843                 return;
 844         }
 845
 846         /*
 847          * Save the info for the exception handler and clear the error.
 848          */
 849         fpu__save(fpu);
 850
 851         task->thread.trap_nr    = trapnr;
 852         task->thread.error_code = error_code;
 853         info.si_signo           = SIGFPE;
 854         info.si_errno           = 0;
 855         info.si_addr            = (void __user *)uprobe_get_trap_addr(regs);
 856
 857         info.si_code = fpu__exception_code(fpu, trapnr);
 858
 859         /* Retry when we get spurious exceptions: */
 860         if (!info.si_code)
 861                 return;
 862
 863         force_sig_info(SIGFPE, &info, task);
 864 }
 865
 866 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 867 {
 868         RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 869         math_error(regs, error_code, X86_TRAP_MF);
 870 }
 871
 872 dotraplinkage void
 873 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 874 {
 875         RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 876         math_error(regs, error_code, X86_TRAP_XF);
 877 }
 878
 879 dotraplinkage void
 880 do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 881 {
 882         cond_local_irq_enable(regs);
 883 }
 884
 885 dotraplinkage void
 886 do_device_not_available(struct pt_regs *regs, long error_code)
 887 {
 888         unsigned long cr0;
 889
 890         RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 891
 892 #ifdef CONFIG_MATH_EMULATION
 893         if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) {
 894                 struct math_emu_info info = { };
 895
 896                 cond_local_irq_enable(regs);
 897
 898                 info.regs = regs;
 899                 math_emulate(&info);
 900                 return;
 901         }
 902 #endif
 903
 904         /* This should not happen. */
 905         cr0 = read_cr0();
 906         if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
 907                 /* Try to fix it up and carry on. */
 908                 write_cr0(cr0 & ~X86_CR0_TS);
 909         } else {
 910                 /*
 911                  * Something terrible happened, and we're better off trying
 912                  * to kill the task than getting stuck in a never-ending
 913                  * loop of #NM faults.
 914                  */
 915                 die("unexpected #NM exception", regs, error_code);
 916         }
 917 }
 918 NOKPROBE_SYMBOL(do_device_not_available);
 919
 920 #ifdef CONFIG_X86_32
 921 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 922 {
 923         siginfo_t info;
 924
 925         RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 926         local_irq_enable();
 927
 928         info.si_signo = SIGILL;
 929         info.si_errno = 0;
 930         info.si_code = ILL_BADSTK;
 931         info.si_addr = NULL;
 932         if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
 933                         X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
 934                 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
 935                         &info);
 936         }
 937 }
 938 #endif
 939
 940 /* Set of traps needed for early debugging. */
 941 void __init early_trap_init(void)
 942 {
 943         /*
 944          * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
 945          * is ready in cpu_init() <-- trap_init(). Before trap_init(),
 946          * CPU runs at ring 0 so it is impossible to hit an invalid
 947          * stack.  Using the original stack works well enough at this
 948          * early stage. DEBUG_STACK will be equipped after cpu_init() in
 949          * trap_init().
 950          *
 951          * We don't need to set trace_idt_table like set_intr_gate(),
 952          * since we don't have trace_debug and it will be reset to
 953          * 'debug' in trap_init() by set_intr_gate_ist().
 954          */
 955         set_intr_gate_notrace(X86_TRAP_DB, debug);
 956         /* int3 can be called from all */
 957         set_system_intr_gate(X86_TRAP_BP, &int3);
 958 #ifdef CONFIG_X86_32
 959         set_intr_gate(X86_TRAP_PF, page_fault);
 960 #endif
 961         load_idt(&idt_descr);
 962 }
 963
 964 void __init early_trap_pf_init(void)
 965 {
 966 #ifdef CONFIG_X86_64
 967         set_intr_gate(X86_TRAP_PF, page_fault);
 968 #endif
 969 }
 970
 971 void __init trap_init(void)
 972 {
 973         int i;
 974
 975 #ifdef CONFIG_EISA
 976         void __iomem *p = early_ioremap(0x0FFFD9, 4);
 977
 978         if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
 979                 EISA_bus = 1;
 980         early_iounmap(p, 4);
 981 #endif
 982
 983         set_intr_gate(X86_TRAP_DE, divide_error);
 984         set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
 985         /* int4 can be called from all */
 986         set_system_intr_gate(X86_TRAP_OF, &overflow);
 987         set_intr_gate(X86_TRAP_BR, bounds);
 988         set_intr_gate(X86_TRAP_UD, invalid_op);
 989         set_intr_gate(X86_TRAP_NM, device_not_available);
 990 #ifdef CONFIG_X86_32
 991         set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
 992 #else
 993         set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
 994 #endif
 995         set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
 996         set_intr_gate(X86_TRAP_TS, invalid_TSS);
 997         set_intr_gate(X86_TRAP_NP, segment_not_present);
 998         set_intr_gate(X86_TRAP_SS, stack_segment);
 999         set_intr_gate(X86_TRAP_GP, general_protection);
1000         set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
1001         set_intr_gate(X86_TRAP_MF, coprocessor_error);
1002         set_intr_gate(X86_TRAP_AC, alignment_check);
1003 #ifdef CONFIG_X86_MCE
1004         set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
1005 #endif
1006         set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
1007
1008         /* Reserve all the builtin and the syscall vector: */
1009         for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
1010                 set_bit(i, used_vectors);
1011
1012 #ifdef CONFIG_IA32_EMULATION
1013         set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat);
1014         set_bit(IA32_SYSCALL_VECTOR, used_vectors);
1015 #endif
1016
1017 #ifdef CONFIG_X86_32
1018         set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
1019         set_bit(IA32_SYSCALL_VECTOR, used_vectors);
1020 #endif
1021
1022         /*
1023          * Set the IDT descriptor to a fixed read-only location, so that the
1024          * "sidt" instruction will not leak the location of the kernel, and
1025          * to defend the IDT against arbitrary memory write vulnerabilities.
1026          * It will be reloaded in cpu_init() */
1027         __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
1028         idt_descr.address = fix_to_virt(FIX_RO_IDT);
1029
1030         /*
1031          * Should be a barrier for any external CPU state:
1032          */
1033         cpu_init();
1034
1035         /*
1036          * X86_TRAP_DB and X86_TRAP_BP have been set
1037          * in early_trap_init(). However, ITS works only after
1038          * cpu_init() loads TSS. See comments in early_trap_init().
1039          */
1040         set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
1041         /* int3 can be called from all */
1042         set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
1043
1044         x86_init.irqs.trap_init();
1045
1046 #ifdef CONFIG_X86_64
1047         memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
1048         set_nmi_gate(X86_TRAP_DB, &debug);
1049         set_nmi_gate(X86_TRAP_BP, &int3);
1050 #endif
1051 }