/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * Handle hardware traps and faults.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/context_tracking.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kgdb.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/kexec.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/io.h>

#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif

#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <linux/atomic.h>
#include <asm/text-patching.h>
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
#include <asm/alternative.h>
#include <asm/fpu/xstate.h>
#include <asm/trace/mpx.h>
#include <asm/nospec-branch.h>
#include <asm/mpx.h>
#include <asm/vm86.h>
#include <asm/umip.h>

#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
#include <asm/proto.h>
#endif

DECLARE_BITMAP(system_vectors, NR_VECTORS);

static inline void cond_local_irq_enable(struct pt_regs *regs)
{
	if (regs->flags & X86_EFLAGS_IF)
		local_irq_enable();
}

static inline void cond_local_irq_disable(struct pt_regs *regs)
{
	if (regs->flags & X86_EFLAGS_IF)
		local_irq_disable();
}

/*
 * In IST context, we explicitly disable preemption. This serves two
 * purposes: it makes it much less likely that we would accidentally
 * schedule in IST context and it will force a warning if we somehow
 * manage to schedule by accident.
 */
void ist_enter(struct pt_regs *regs)
{
	if (user_mode(regs)) {
		RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
	} else {
		/*
		 * We might have interrupted pretty much anything. In
		 * fact, if we're a machine check, we can even interrupt
		 * NMI processing. We don't want in_nmi() to return true,
		 * but we need to notify RCU.
		 */
		rcu_nmi_enter();
	}

	preempt_disable();

	/* This code is a bit fragile. Test it. */
	RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
}

void ist_exit(struct pt_regs *regs)
{
	preempt_enable_no_resched();

	if (!user_mode(regs))
		rcu_nmi_exit();
}

/**
 * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
 * @regs: regs passed to the IST exception handler
 *
 * IST exception handlers normally cannot schedule. As a special
 * exception, if the exception interrupted userspace code (i.e.
 * user_mode(regs) would return true) and the exception was not
 * a double fault, it can be safe to schedule. ist_begin_non_atomic()
 * begins a non-atomic section within an ist_enter()/ist_exit() region.
 * Callers are responsible for enabling interrupts themselves inside
 * the non-atomic section, and callers must call ist_end_non_atomic()
 * before ist_exit().
 */
void ist_begin_non_atomic(struct pt_regs *regs)
{
	BUG_ON(!user_mode(regs));

	/*
	 * Sanity check: we need to be on the normal thread stack. This
	 * will catch asm bugs and any attempt to use ist_preempt_enable
	 * from double_fault.
	 */
	BUG_ON(!on_thread_stack());

	preempt_enable_no_resched();
}

/**
 * ist_end_non_atomic() - end a non-atomic section in an IST exception
 *
 * Ends a non-atomic section started with ist_begin_non_atomic().
 */
void ist_end_non_atomic(void)
{
	preempt_disable();
}

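/*
 * Used by the generic report_bug() code: a BUG()/WARN() ud2 can only live in
 * kernel text, so any address below TASK_SIZE_MAX cannot be a valid bug site.
 */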
int is_valid_bugaddr(unsigned long addr)
{
	unsigned short ud;

	if (addr < TASK_SIZE_MAX)
		return 0;

	if (probe_kernel_address((unsigned short *)addr, ud))
		return 0;

	return ud == INSN_UD0 || ud == INSN_UD2;
}

int fixup_bug(struct pt_regs *regs, int trapnr)
{
	if (trapnr != X86_TRAP_UD)
		return 0;

	switch (report_bug(regs->ip, regs)) {
	case BUG_TRAP_TYPE_NONE:
	case BUG_TRAP_TYPE_BUG:
		break;

	case BUG_TRAP_TYPE_WARN:
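		/* A WARN is recoverable: skip the ud2 so execution resumes after it. */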
		regs->ip += LEN_UD2;
		return 1;
	}

	return 0;
}

static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
		  struct pt_regs *regs, long error_code)
{
	if (v8086_mode(regs)) {
		/*
		 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
		 * On nmi (interrupt 2), do_trap should not be called.
		 */
		if (trapnr < X86_TRAP_UD) {
			if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
						error_code, trapnr))
				return 0;
		}
		return -1;
	}

	if (!user_mode(regs)) {
		if (fixup_exception(regs, trapnr))
			return 0;

		tsk->thread.error_code = error_code;
		tsk->thread.trap_nr = trapnr;
		die(str, regs, error_code);
	}

	return -1;
}

static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr,
				 siginfo_t *info)
{
	unsigned long siaddr;
	int sicode;

	switch (trapnr) {
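	/* No useful fault address or code for this trap: fall back to SEND_SIG_PRIV. */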
	default:
		return SEND_SIG_PRIV;

	case X86_TRAP_DE:
		sicode = FPE_INTDIV;
		siaddr = uprobe_get_trap_addr(regs);
		break;
	case X86_TRAP_UD:
		sicode = ILL_ILLOPN;
		siaddr = uprobe_get_trap_addr(regs);
		break;
	case X86_TRAP_AC:
		sicode = BUS_ADRALN;
		siaddr = 0;
		break;
	}

	info->si_signo = signr;
	info->si_errno = 0;
	info->si_code = sicode;
	info->si_addr = (void __user *)siaddr;
	return info;
}

static void
do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
	long error_code, siginfo_t *info)
{
	struct task_struct *tsk = current;


	if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
		return;
	/*
	 * We want error_code and trap_nr set for userspace faults and
	 * kernelspace faults which result in die(), but not
	 * kernelspace faults which are fixed up. die() gives the
	 * process no chance to handle the signal and notice the
	 * kernel fault information, so that won't result in polluting
	 * the information about previously queued, but not yet
	 * delivered, faults. See also do_general_protection below.
	 */
	tsk->thread.error_code = error_code;
	tsk->thread.trap_nr = trapnr;

	if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
	    printk_ratelimit()) {
		pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
			tsk->comm, tsk->pid, str,
			regs->ip, regs->sp, error_code);
		print_vma_addr(KERN_CONT " in ", regs->ip);
		pr_cont("\n");
	}

	force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
}
NOKPROBE_SYMBOL(do_trap);

static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
			  unsigned long trapnr, int signr)
{
	siginfo_t info;

	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");

	/*
	 * WARN*()s end up here; fix them up before we call the
	 * notifier chain.
	 */
	if (!user_mode(regs) && fixup_bug(regs, trapnr))
		return;

	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
			NOTIFY_STOP) {
		cond_local_irq_enable(regs);
		do_trap(trapnr, signr, str, regs, error_code,
			fill_trap_info(regs, signr, trapnr, &info));
	}
}

#define DO_ERROR(trapnr, signr, str, name)				\
dotraplinkage void do_##name(struct pt_regs *regs, long error_code)	\
{									\
	do_error_trap(regs, error_code, str, trapnr, signr);		\
}

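/*
 * Each DO_ERROR() line below expands to a do_<name>() handler that simply
 * funnels the trap into do_error_trap() with the given signal and string.
 */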
DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error)
DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op)
DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)

#ifdef CONFIG_VMAP_STACK
__visible void __noreturn handle_stack_overflow(const char *message,
						struct pt_regs *regs,
						unsigned long fault_address)
{
	printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
		 (void *)fault_address, current->stack,
		 (char *)current->stack + THREAD_SIZE - 1);
	die(message, regs, 0);

	/* Be absolutely certain we don't return. */
	panic(message);
}
#endif

#ifdef CONFIG_X86_64
/* Runs on IST stack */
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
{
	static const char str[] = "double fault";
	struct task_struct *tsk = current;
#ifdef CONFIG_VMAP_STACK
	unsigned long cr2;
#endif

#ifdef CONFIG_X86_ESPFIX64
	extern unsigned char native_irq_return_iret[];

	/*
	 * If IRET takes a non-IST fault on the espfix64 stack, then we
	 * end up promoting it to a doublefault. In that case, take
	 * advantage of the fact that we're not using the normal (TSS.sp0)
	 * stack right now. We can write a fake #GP(0) frame at TSS.sp0
	 * and then modify our own IRET frame so that, when we return,
	 * we land directly at the #GP(0) vector with the stack already
	 * set up according to its expectations.
	 *
	 * The net result is that our #GP handler will think that we
	 * entered from usermode with the bad user context.
	 *
	 * No need for ist_enter here because we don't use RCU.
	 */
	if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
		regs->cs == __KERNEL_CS &&
		regs->ip == (unsigned long)native_irq_return_iret)
	{
		struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;

		/*
		 * regs->sp points to the failing IRET frame on the
		 * ESPFIX64 stack. Copy it to the entry stack. This fills
		 * in gpregs->ss through gpregs->ip.
		 */
		memmove(&gpregs->ip, (void *)regs->sp, 5*8);
		gpregs->orig_ax = 0;  /* Missing (lost) #GP error code */

		/*
		 * Adjust our frame so that we return straight to the #GP
		 * vector with the expected RSP value. This is safe because
		 * we won't enable interrupts or schedule before we invoke
		 * general_protection, so nothing will clobber the stack
		 * frame we just set up.
		 */
		regs->ip = (unsigned long)general_protection;
		regs->sp = (unsigned long)&gpregs->orig_ax;

		/*
		 * This situation can be triggered by userspace via
		 * modify_ldt(2) and the return does not take the regular
		 * user space exit, so a CPU buffer clear is required when
		 * MDS mitigation is enabled.
		 */
		mds_user_clear_cpu_buffers();
		return;
	}
#endif

	ist_enter(regs);
	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);

	tsk->thread.error_code = error_code;
	tsk->thread.trap_nr = X86_TRAP_DF;

#ifdef CONFIG_VMAP_STACK
	/*
	 * If we overflow the stack into a guard page, the CPU will fail
	 * to deliver #PF and will send #DF instead. Similarly, if we
	 * take any non-IST exception while too close to the bottom of
	 * the stack, the processor will get a page fault while
	 * delivering the exception and will generate a double fault.
	 *
	 * According to the SDM (footnote in 6.15 under "Interrupt 14 -
	 * Page-Fault Exception (#PF)"):
	 *
	 *   Processors update CR2 whenever a page fault is detected. If a
	 *   second page fault occurs while an earlier page fault is being
	 *   delivered, the faulting linear address of the second fault will
	 *   overwrite the contents of CR2 (replacing the previous
	 *   address). These updates to CR2 occur even if the page fault
	 *   results in a double fault or occurs during the delivery of a
	 *   double fault.
	 *
	 * The logic below has a small possibility of incorrectly diagnosing
	 * some errors as stack overflows. For example, if the IDT or GDT
	 * gets corrupted such that #GP delivery fails due to a bad descriptor
	 * causing #GP and we hit this condition while CR2 coincidentally
	 * points to the stack guard page, we'll think we overflowed the
	 * stack. Given that we're going to panic one way or another
	 * if this happens, this isn't necessarily worth fixing.
	 *
	 * If necessary, we could improve the test by only diagnosing
	 * a stack overflow if the saved RSP points within 47 bytes of
	 * the bottom of the stack: if RSP == tsk_stack + 48 and we
	 * take an exception, the stack is already aligned and there
	 * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
	 * possible error code, so a stack overflow would *not* double
	 * fault. With any less space left, exception delivery could
	 * fail, and, as a practical matter, we've overflowed the
	 * stack even if the actual trigger for the double fault was
	 * something else.
	 */
	cr2 = read_cr2();
	if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
		handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
#endif

#ifdef CONFIG_DOUBLEFAULT
	df_debug(regs, error_code);
#endif
	/*
	 * This is always a kernel trap and never fixable (and thus must
	 * never return).
	 */
	for (;;)
		die(str, regs, error_code);
}
#endif

dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
{
	const struct mpx_bndcsr *bndcsr;
	siginfo_t *info;

	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
	if (notify_die(DIE_TRAP, "bounds", regs, error_code,
			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
		return;
	cond_local_irq_enable(regs);

	if (!user_mode(regs))
		die("bounds", regs, error_code);

	if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
		/* The exception is not from Intel MPX */
		goto exit_trap;
	}

	/*
	 * We need to look at BNDSTATUS to resolve this exception.
	 * A NULL here might mean that it is in its 'init state',
	 * which is all zeros which indicates MPX was not
	 * responsible for the exception.
	 */
	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
	if (!bndcsr)
		goto exit_trap;

	trace_bounds_exception_mpx(bndcsr);
	/*
	 * The error code field of the BNDSTATUS register communicates status
	 * information of a bound range exception #BR or operation involving
	 * bound directory.
	 */
	switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
	case 2:	/* Bound directory has invalid entry. */
		if (mpx_handle_bd_fault())
			goto exit_trap;
		break; /* Success, it was handled */
	case 1: /* Bound violation. */
		info = mpx_generate_siginfo(regs);
		if (IS_ERR(info)) {
			/*
			 * We failed to decode the MPX instruction. Act as if
			 * the exception was not caused by MPX.
			 */
			goto exit_trap;
		}
		/*
		 * Success, we decoded the instruction and retrieved
		 * an 'info' containing the address being accessed
		 * which caused the exception. This information
		 * allows an application to possibly handle the
		 * #BR exception itself.
		 */
		do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info);
		kfree(info);
		break;
	case 0: /* No exception caused by Intel MPX operations. */
		goto exit_trap;
	default:
		die("bounds", regs, error_code);
	}

	return;

exit_trap:
	/*
	 * This path out is for all the cases where we could not
	 * handle the exception in some way (like allocating a
	 * table or telling userspace about it). We will also end
	 * up here if the kernel has MPX turned off at compile
	 * time.
	 */
	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
}

dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
	struct task_struct *tsk;

	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
	cond_local_irq_enable(regs);

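	/*
	 * With UMIP enabled, SGDT/SIDT/SLDT/SMSW/STR raise #GP when executed
	 * in user mode; try to emulate the instruction before falling through
	 * to the signal path.
	 */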
	if (static_cpu_has(X86_FEATURE_UMIP)) {
		if (user_mode(regs) && fixup_umip_exception(regs))
			return;
	}

	if (v8086_mode(regs)) {
		local_irq_enable();
		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
		return;
	}

	tsk = current;
	if (!user_mode(regs)) {
		if (fixup_exception(regs, X86_TRAP_GP))
			return;

		tsk->thread.error_code = error_code;
		tsk->thread.trap_nr = X86_TRAP_GP;
		if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
			       X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
			die("general protection fault", regs, error_code);
		return;
	}

	tsk->thread.error_code = error_code;
	tsk->thread.trap_nr = X86_TRAP_GP;

	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
			printk_ratelimit()) {
		pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
			tsk->comm, task_pid_nr(tsk),
			regs->ip, regs->sp, error_code);
		print_vma_addr(KERN_CONT " in ", regs->ip);
		pr_cont("\n");
	}

	force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
}
NOKPROBE_SYMBOL(do_general_protection);

dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
#ifdef CONFIG_DYNAMIC_FTRACE
	/*
	 * ftrace must be first, everything else may cause a recursive crash.
	 * See note by declaration of modifying_ftrace_code in ftrace.c
	 */
	if (unlikely(atomic_read(&modifying_ftrace_code)) &&
	    ftrace_int3_handler(regs))
		return;
#endif
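	/* Breakpoints planted by text_poke_bp() are handled by the patching code itself. */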
	if (poke_int3_handler(regs))
		return;

	/*
	 * Use ist_enter despite the fact that we don't use an IST stack.
	 * We can be called from a kprobe in non-CONTEXT_KERNEL kernel
	 * mode or even during context tracking state changes.
	 *
	 * This means that we can't schedule. That's okay.
	 */
	ist_enter(regs);
	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
				SIGTRAP) == NOTIFY_STOP)
		goto exit;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */

#ifdef CONFIG_KPROBES
	if (kprobe_int3_handler(regs))
		goto exit;
#endif

	if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
			SIGTRAP) == NOTIFY_STOP)
		goto exit;

	cond_local_irq_enable(regs);
	do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
	cond_local_irq_disable(regs);

exit:
	ist_exit(regs);
}
NOKPROBE_SYMBOL(do_int3);

#ifdef CONFIG_X86_64
/*
 * Help handler running on a per-cpu (IST or entry trampoline) stack
 * to switch to the normal thread stack if the interrupted code was in
 * user mode. The actual stack switch is done in entry_64.S
 */
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
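	/*
	 * cpu_current_top_of_stack marks the end of the thread stack; backing
	 * up by one pt_regs gives the slot where the user-mode register state
	 * belongs.
	 */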
	struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
	if (regs != eregs)
		*regs = *eregs;
	return regs;
}
NOKPROBE_SYMBOL(sync_regs);

struct bad_iret_stack {
	void *error_entry_ret;
	struct pt_regs regs;
};

asmlinkage __visible notrace
struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
{
	/*
	 * This is called from entry_64.S early in handling a fault
	 * caused by a bad iret to user mode. To handle the fault
	 * correctly, we want to move our stack frame to where it would
	 * be had we entered directly on the entry stack (rather than
	 * just below the IRET frame) and we want to pretend that the
	 * exception came from the IRET target.
	 */
	struct bad_iret_stack *new_stack =
		(struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;

	/* Copy the IRET target to the new stack. */
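	/* The hardware IRET frame is five 8-byte words: RIP, CS, RFLAGS, RSP, SS -- hence the 5*8 below. */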
	memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);

	/* Copy the remainder of the stack from the current stack. */
	memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));

	BUG_ON(!user_mode(&new_stack->regs));
	return new_stack;
}
NOKPROBE_SYMBOL(fixup_bad_iret);
#endif

static bool is_sysenter_singlestep(struct pt_regs *regs)
{
	/*
	 * We don't try for precision here. If we're anywhere in the region of
	 * code that can be single-stepped in the SYSENTER entry path, then
	 * assume that this is a useless single-step trap due to SYSENTER
	 * being invoked with TF set. (We don't know in advance exactly
	 * which instructions will be hit because BTF could plausibly
	 * be set.)
	 */
#ifdef CONFIG_X86_32
	return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
		(unsigned long)__end_SYSENTER_singlestep_region -
		(unsigned long)__begin_SYSENTER_singlestep_region;
#elif defined(CONFIG_IA32_EMULATION)
	return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
		(unsigned long)__end_entry_SYSENTER_compat -
		(unsigned long)entry_SYSENTER_compat;
#else
	return false;
#endif
}

/*
 * Our handling of the processor debug registers is non-trivial.
 * We do not clear them on entry and exit from the kernel. Therefore
 * it is possible to get a watchpoint trap here from inside the kernel.
 * However, the code in ./ptrace.c has ensured that the user can
 * only set watchpoints on userspace addresses. Therefore the in-kernel
 * watchpoint trap can only occur in code which is reading/writing
 * from user space. Such code must not hold kernel locks (since it
 * can equally take a page fault), therefore it is safe to call
 * force_sig_info even though that claims and releases locks.
 *
 * Code in ./signal.c ensures that the debug control register
 * is restored before we deliver any signal, and therefore that
 * user code runs with the correct debug control register even though
 * we clear it here.
 *
 * Being careful here means that we don't have to be as careful in a
 * lot of more complicated places (task switching can be a bit lazy
 * about restoring all the debug state, and ptrace doesn't have to
 * find every occurrence of the TF bit that could be saved away even
 * by user code)
 *
 * May run on IST stack.
 */
dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
{
	struct task_struct *tsk = current;
	int user_icebp = 0;
	unsigned long dr6;
	int si_code;

	ist_enter(regs);

	get_debugreg(dr6, 6);
	/*
	 * The Intel SDM says:
	 *
	 *   Certain debug exceptions may clear bits 0-3. The remaining
	 *   contents of the DR6 register are never cleared by the
	 *   processor. To avoid confusion in identifying debug
	 *   exceptions, debug handlers should clear the register before
	 *   returning to the interrupted task.
	 *
	 * Keep it simple: clear DR6 immediately.
	 */
	set_debugreg(0, 6);

	/* Filter out all the reserved bits which are preset to 1 */
	dr6 &= ~DR6_RESERVED;

	/*
	 * The SDM says "The processor clears the BTF flag when it
	 * generates a debug exception." Clear TIF_BLOCKSTEP to keep
	 * TIF_BLOCKSTEP in sync with the hardware BTF flag.
	 */
	clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);

	if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
		     is_sysenter_singlestep(regs))) {
		dr6 &= ~DR_STEP;
		if (!dr6)
			goto exit;
		/*
		 * else we might have gotten a single-step trap and hit a
		 * watchpoint at the same time, in which case we should fall
		 * through and handle the watchpoint.
		 */
	}

	/*
	 * If dr6 has no reason to give us about the origin of this trap,
	 * then it's very likely the result of an icebp/int01 trap.
	 * User wants a sigtrap for that.
	 */
	if (!dr6 && user_mode(regs))
		user_icebp = 1;

	/* Store the virtualized DR6 value */
	tsk->thread.debugreg6 = dr6;

#ifdef CONFIG_KPROBES
	if (kprobe_debug_handler(regs))
		goto exit;
#endif

	if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
							SIGTRAP) == NOTIFY_STOP)
		goto exit;

	/*
	 * Let others (NMI) know that the debug stack is in use
	 * as we may switch to the interrupt stack.
	 */
	debug_stack_usage_inc();

	/* It's safe to allow irq's after DR6 has been saved */
	cond_local_irq_enable(regs);

	if (v8086_mode(regs)) {
		handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
					X86_TRAP_DB);
		cond_local_irq_disable(regs);
		debug_stack_usage_dec();
		goto exit;
	}

	if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
		/*
		 * Historical junk that used to handle SYSENTER single-stepping.
		 * This should be unreachable now. If we survive for a while
		 * without anyone hitting this warning, we'll turn this into
		 * an oops.
		 */
		tsk->thread.debugreg6 &= ~DR_STEP;
		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
		regs->flags &= ~X86_EFLAGS_TF;
	}
	si_code = get_si_code(tsk->thread.debugreg6);
	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
		send_sigtrap(tsk, regs, error_code, si_code);
	cond_local_irq_disable(regs);
	debug_stack_usage_dec();

exit:
	ist_exit(regs);
}
NOKPROBE_SYMBOL(do_debug);

/*
 * Note that we play around with the 'TS' bit in an attempt to get
 * the correct behaviour even in the presence of the asynchronous
 * IRQ13 behaviour
 */
static void math_error(struct pt_regs *regs, int error_code, int trapnr)
{
	struct task_struct *task = current;
	struct fpu *fpu = &task->thread.fpu;
	siginfo_t info;
	char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
						"simd exception";

	cond_local_irq_enable(regs);

	if (!user_mode(regs)) {
		if (fixup_exception(regs, trapnr))
			return;

		task->thread.error_code = error_code;
		task->thread.trap_nr = trapnr;

		if (notify_die(DIE_TRAP, str, regs, error_code,
					trapnr, SIGFPE) != NOTIFY_STOP)
			die(str, regs, error_code);
		return;
	}

	/*
	 * Save the info for the exception handler and clear the error.
	 */
	fpu__save(fpu);

	task->thread.trap_nr = trapnr;
	task->thread.error_code = error_code;
	info.si_signo = SIGFPE;
	info.si_errno = 0;
	info.si_addr = (void __user *)uprobe_get_trap_addr(regs);

	info.si_code = fpu__exception_code(fpu, trapnr);

	/* Retry when we get spurious exceptions: */
	if (!info.si_code)
		return;

	force_sig_info(SIGFPE, &info, task);
}

dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
{
	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
	math_error(regs, error_code, X86_TRAP_MF);
}

dotraplinkage void
do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
{
	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
	math_error(regs, error_code, X86_TRAP_XF);
}

dotraplinkage void
do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
{
	cond_local_irq_enable(regs);
}

dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
	unsigned long cr0;

	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");

#ifdef CONFIG_MATH_EMULATION
	if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) {
		struct math_emu_info info = { };

		cond_local_irq_enable(regs);

		info.regs = regs;
		math_emulate(&info);
		return;
	}
#endif

	/* This should not happen. */
	cr0 = read_cr0();
	if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
		/* Try to fix it up and carry on. */
		write_cr0(cr0 & ~X86_CR0_TS);
	} else {
		/*
		 * Something terrible happened, and we're better off trying
		 * to kill the task than getting stuck in a never-ending
		 * loop of #NM faults.
		 */
		die("unexpected #NM exception", regs, error_code);
	}
}
NOKPROBE_SYMBOL(do_device_not_available);

#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
{
	siginfo_t info;

	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
	local_irq_enable();

	info.si_signo = SIGILL;
	info.si_errno = 0;
	info.si_code = ILL_BADSTK;
	info.si_addr = NULL;
	if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
			X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
		do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
			&info);
	}
}
#endif

void __init trap_init(void)
{
	/* Init cpu_entry_area before IST entries are set up */
	setup_cpu_entry_areas();

	idt_setup_traps();

	/*
	 * Set the IDT descriptor to a fixed read-only location, so that the
	 * "sidt" instruction will not leak the location of the kernel, and
	 * to defend the IDT against arbitrary memory write vulnerabilities.
	 * It will be reloaded in cpu_init().
	 */
	cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
		    PAGE_KERNEL_RO);
	idt_descr.address = CPU_ENTRY_AREA_RO_IDT;

	/*
	 * Should be a barrier for any external CPU state:
	 */
	cpu_init();

	idt_setup_ist_traps();

	x86_init.irqs.trap_init();

	idt_setup_debugidt_traps();
}