/*
 *  linux/arch/x86_64/entry.S
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.txt
 *
 * A note on terminology:
 * - iret frame:	Architecture defined interrupt frame from SS to RIP
 *			at the top of the kernel process stack.
 *
 * Some macro usage:
 * - ENTRY/END:		Define functions in the symbol table.
 * - TRACE_IRQ_*:	Trace hardirq state for lock debugging.
 * - idtentry:		Define exception entry points.
 */
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/context_tracking.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
#include <linux/elf-em.h>
#define AUDIT_ARCH_X86_64			(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_64BIT			0x80000000
#define __AUDIT_ARCH_LE				0x40000000
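/*
 * Informational note: EM_X86_64 is 62 (0x3e), so AUDIT_ARCH_X86_64 above
 * works out to 0x3e | 0x80000000 | 0x40000000 == 0xc000003e, the value
 * that audit/seccomp code compares against for 64-bit x86 syscalls.
 */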

.code64
.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
	swapgs
	sysretq
ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */

.macro TRACE_IRQS_IRETQ
#ifdef CONFIG_TRACE_IRQFLAGS
	bt	$9, EFLAGS(%rsp)		/* interrupts off? */
	jnc	1f
	TRACE_IRQS_ON
1:
#endif
.endm

/*
 * When dynamic function tracer is enabled it will add a breakpoint
 * to all locations that it is about to modify, sync CPUs, update
 * all the code, sync CPUs, then remove the breakpoints. In this time
 * if lockdep is enabled, it might jump back into the debug handler
 * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
 *
 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
 * make sure the stack pointer does not get reset back to the top
 * of the debug stack, and instead just reuses the current stack.
 */
#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)

.macro TRACE_IRQS_OFF_DEBUG
	call	debug_stack_set_zero
	TRACE_IRQS_OFF
	call	debug_stack_reset
.endm

.macro TRACE_IRQS_ON_DEBUG
	call	debug_stack_set_zero
	TRACE_IRQS_ON
	call	debug_stack_reset
.endm

.macro TRACE_IRQS_IRETQ_DEBUG
	bt	$9, EFLAGS(%rsp)		/* interrupts off? */
	jnc	1f
	TRACE_IRQS_ON_DEBUG
1:
.endm

#else
# define TRACE_IRQS_OFF_DEBUG			TRACE_IRQS_OFF
# define TRACE_IRQS_ON_DEBUG			TRACE_IRQS_ON
# define TRACE_IRQS_IRETQ_DEBUG			TRACE_IRQS_IRETQ
#endif

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When user can change pt_regs->foo always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

ENTRY(entry_SYSCALL_64)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(entry_SYSCALL_64_after_swapgs)

	movq	%rsp, PER_CPU_VAR(rsp_scratch)
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS			/* pt_regs->ss */
	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
	/*
	 * Re-enable interrupts.
	 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
	 * must execute atomically in the face of possible interrupt-driven
	 * task preemption. We must enable interrupts only after we're done
	 * with using rsp_scratch:
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq	%r11				/* pt_regs->flags */
	pushq	$__USER_CS			/* pt_regs->cs */
	pushq	%rcx				/* pt_regs->ip */
	pushq	%rax				/* pt_regs->orig_ax */
	pushq	%rdi				/* pt_regs->di */
	pushq	%rsi				/* pt_regs->si */
	pushq	%rdx				/* pt_regs->dx */
	pushq	%rcx				/* pt_regs->cx */
	pushq	$-ENOSYS			/* pt_regs->ax */
	pushq	%r8				/* pt_regs->r8 */
	pushq	%r9				/* pt_regs->r9 */
	pushq	%r10				/* pt_regs->r10 */
	pushq	%r11				/* pt_regs->r11 */
	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */

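	/*
	 * At this point the stack holds a partial struct pt_regs: the iret
	 * frame (ss, sp, flags, cs, ip), orig_ax, and the caller-clobbered
	 * GP registers pushed above.  The final "sub $(6*8), %rsp" only
	 * reserves the bp/bx/r12-r15 slots without saving them, which is
	 * what the comments below mean by "incompletely filled" pt_regs.
	 */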
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	tracesys
entry_SYSCALL_64_fastpath:
#if __SYSCALL_MASK == ~0
	cmpq	$__NR_syscall_max, %rax
#else
	andl	$__SYSCALL_MASK, %eax
	cmpl	$__NR_syscall_max, %eax
#endif
	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
	movq	%r10, %rcx
	call	*sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
/*
 * Syscall return path ending with SYSRET (fast path).
 * Has incompletely filled pt_regs.
 */
	LOCKDEP_SYS_EXIT
	/*
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)

	/*
	 * We must check ti flags with interrupts (or at least preemption)
	 * off because we must *never* return to userspace without
	 * processing exit work that is enqueued if we're preempted here.
	 * In particular, returning to userspace with any of the one-shot
	 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
	 * very bad.
	 */
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	int_ret_from_sys_call_irqs_off	/* Go to the slow path */

	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11
	movq	RSP(%rsp), %rsp
	/*
	 * 64-bit SYSRET restores rip from rcx,
	 * rflags from r11 (but RF and VM bits are forced to 0),
	 * cs and ss are loaded from MSRs.
	 * Restoration of rflags re-enables interrupts.
	 *
	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
	 * descriptor is not reinitialized.  This means that we should
	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
	 * exit the kernel, and re-enter using an interrupt vector.  (All
	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
	 * from happening by reloading SS in __switch_to.  (Actually
	 * detecting the failure in 64-bit userspace is tricky but can be
	 * done.)
	 */
	USERGS_SYSRET64

	/* Do syscall entry tracing */
tracesys:
	movq	%rsp, %rdi
	movl	$AUDIT_ARCH_X86_64, %esi
	call	syscall_trace_enter_phase1
	test	%rax, %rax
	jnz	tracesys_phase2			/* if needed, run the slow path */
	RESTORE_C_REGS_EXCEPT_RAX		/* else restore clobbered regs */
	movq	ORIG_RAX(%rsp), %rax
	jmp	entry_SYSCALL_64_fastpath	/* and return to the fast path */

tracesys_phase2:
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi
	movl	$AUDIT_ARCH_X86_64, %esi
	movq	%rax, %rdx
	call	syscall_trace_enter_phase2

	/*
	 * Reload registers from stack in case ptrace changed them.
	 * We don't reload %rax because syscall_trace_enter_phase2() returned
	 * the value it wants us to use in the table lookup.
	 */
	RESTORE_C_REGS_EXCEPT_RAX
	RESTORE_EXTRA_REGS
#if __SYSCALL_MASK == ~0
	cmpq	$__NR_syscall_max, %rax
#else
	andl	$__SYSCALL_MASK, %eax
	cmpl	$__NR_syscall_max, %eax
#endif
	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
	movq	%r10, %rcx			/* fixup for C */
	call	*sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
	/* Use IRET because user could have changed pt_regs->foo */

/*
 * Syscall return path ending with IRET.
 * Has correct iret frame.
 */
GLOBAL(int_ret_from_sys_call)
	DISABLE_INTERRUPTS(CLBR_NONE)
int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
	TRACE_IRQS_OFF
	movl	$_TIF_ALLWORK_MASK, %edi
	/* edi: mask to check */
GLOBAL(int_with_check)
	LOCKDEP_SYS_EXIT_IRQ
	GET_THREAD_INFO(%rcx)
	movl	TI_flags(%rcx), %edx
	andl	%edi, %edx
	jnz	int_careful
	andl	$~TS_COMPAT, TI_status(%rcx)
	jmp	syscall_return

	/*
	 * Either reschedule or signal or syscall exit tracking needed.
	 * First do a reschedule test.
	 * edx: work, edi: workmask
	 */
int_careful:
	bt	$TIF_NEED_RESCHED, %edx
	jnc	int_very_careful
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq	%rdi
	SCHEDULE_USER
	popq	%rdi
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp	int_with_check

	/* handle signals and tracing -- both require a full pt_regs */
int_very_careful:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_EXTRA_REGS
	/* Check for syscall exit trace */
	testl	$_TIF_WORK_SYSCALL_EXIT, %edx
	jz	int_signal
	pushq	%rdi
	leaq	8(%rsp), %rdi			/* &ptregs -> arg1 */
	call	syscall_trace_leave
	popq	%rdi
	andl	$~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi
	jmp	int_restore_rest

int_signal:
	testl	$_TIF_DO_NOTIFY_MASK, %edx
	jz	1f
	movq	%rsp, %rdi			/* &ptregs -> arg1 */
	xorl	%esi, %esi			/* oldset -> arg2 */
	call	do_notify_resume
1:	movl	$_TIF_WORK_MASK, %edi
int_restore_rest:
	RESTORE_EXTRA_REGS
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp	int_with_check

syscall_return:
	/* The IRETQ could re-enable interrupts: */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.
	 */
	movq	RCX(%rsp), %rcx
	movq	RIP(%rsp), %r11
	cmpq	%rcx, %r11			/* RCX == RIP */
	jne	opportunistic_sysret_failed

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space.  This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * If width of "canonical tail" ever becomes variable, this will need
	 * to be updated to remain correct on both old and new CPUs.
	 */
	.ifne __VIRTUAL_MASK_SHIFT - 47
	.error "virtual address width changed -- SYSRET checks need update"
	.endif

	/* Change top 16 bits to be the sign-extension of 47th bit */
	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx

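	/*
	 * Worked example (informational): with __VIRTUAL_MASK_SHIFT == 47,
	 * as enforced above, the shl/sar pair shifts by 16, so bits 63:47
	 * become copies of bit 47.  A canonical address survives this round
	 * trip unchanged; anything else is altered and caught by the
	 * compare below.
	 */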
	/* If this changed %rcx, it was not canonical */
	cmpq	%rcx, %r11
	jne	opportunistic_sysret_failed

	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
	jne	opportunistic_sysret_failed

	movq	R11(%rsp), %r11
	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
	jne	opportunistic_sysret_failed

	/*
	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
	 * restoring TF results in a trap from userspace immediately after
	 * SYSRET.  This would cause an infinite loop whenever #DB happens
	 * with register state that satisfies the opportunistic SYSRET
	 * conditions.  For example, single-stepping this user code:
	 *
	 *           movq	$stuck_here, %rcx
	 *           pushfq
	 *           popq %r11
	 *   stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz	opportunistic_sysret_failed

	/* nothing to check for RSP */

	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
	jne	opportunistic_sysret_failed

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

opportunistic_sysret_failed:
	SWAPGS
	jmp	restore_c_regs_and_iret
END(entry_SYSCALL_64)


	.macro FORK_LIKE func
ENTRY(stub_\func)
	SAVE_EXTRA_REGS 8
	jmp	sys_\func
END(stub_\func)
	.endm
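	/*
	 * Note on the "8": these stubs are reached via the indirect
	 * "call *sys_call_table(, %rax, 8)" above, so a return address sits
	 * on top of pt_regs; the 8-byte offset skips it when saving the
	 * extra registers.  The execve/sigreturn stubs below do the same and
	 * pop it with "addq $8, %rsp" before taking the IRET return path.
	 */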

	FORK_LIKE  clone
	FORK_LIKE  fork
	FORK_LIKE  vfork

ENTRY(stub_execve)
	call	sys_execve
return_from_execve:
	testl	%eax, %eax
	jz	1f
	/* exec failed, can use fast SYSRET code path in this case */
	ret
1:
	/* must use IRET code path (pt_regs->cs may have changed) */
	addq	$8, %rsp
	ZERO_EXTRA_REGS
	movq	%rax, RAX(%rsp)
	jmp	int_ret_from_sys_call
END(stub_execve)
/*
 * Remaining execve stubs are only 7 bytes long.
 * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
 */
	.align	8
GLOBAL(stub_execveat)
	call	sys_execveat
	jmp	return_from_execve
END(stub_execveat)

#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
	.align	8
GLOBAL(stub_x32_execve)
GLOBAL(stub32_execve)
	call	compat_sys_execve
	jmp	return_from_execve
END(stub32_execve)
END(stub_x32_execve)
	.align	8
GLOBAL(stub_x32_execveat)
GLOBAL(stub32_execveat)
	call	compat_sys_execveat
	jmp	return_from_execve
END(stub32_execveat)
END(stub_x32_execveat)
#endif

/*
 * sigreturn is special because it needs to restore all registers on return.
 * This cannot be done with SYSRET, so use the IRET return path instead.
 */
ENTRY(stub_rt_sigreturn)
	/*
	 * SAVE_EXTRA_REGS result is not normally needed:
	 * sigreturn overwrites all pt_regs->GPREGS.
	 * But sigreturn can fail (!), and there is no easy way to detect that.
	 * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
	 * we SAVE_EXTRA_REGS here.
	 */
	SAVE_EXTRA_REGS 8
	call	sys_rt_sigreturn
return_from_stub:
	addq	$8, %rsp
	RESTORE_EXTRA_REGS
	movq	%rax, RAX(%rsp)
	jmp	int_ret_from_sys_call
END(stub_rt_sigreturn)

#ifdef CONFIG_X86_X32_ABI
ENTRY(stub_x32_rt_sigreturn)
	SAVE_EXTRA_REGS 8
	call	sys32_x32_rt_sigreturn
	jmp	return_from_stub
END(stub_x32_rt_sigreturn)
#endif

/*
 * A newly forked process directly context switches into this address.
 *
 * rdi: prev task we switched from
 */
ENTRY(ret_from_fork)

	LOCK ; btr $TIF_FORK, TI_flags(%r8)

	pushq	$0x0002
	popfq					/* reset kernel eflags */

	call	schedule_tail			/* rdi: 'prev' task parameter */

	RESTORE_EXTRA_REGS

	testb	$3, CS(%rsp)			/* from kernel_thread? */

	/*
	 * By the time we get here, we have no idea whether our pt_regs,
	 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
	 * the slow path, or one of the 32-bit compat paths.
	 * Use IRET code path to return, since it can safely handle
	 * all of the above.
	 */
	jnz	int_ret_from_sys_call

	/*
	 * We came from kernel_thread
	 * nb: we depend on RESTORE_EXTRA_REGS above
	 */
	movq	%rbp, %rdi
	call	*%rbx
	movl	$0, RAX(%rsp)
	RESTORE_EXTRA_REGS
	jmp	int_ret_from_sys_call
END(ret_from_fork)

/*
 * Build the entry stubs with some assembler magic.
 * We pack 1 stub into every 8-byte block.
 */
	.align 8
ENTRY(irq_entries_start)
    vector=FIRST_EXTERNAL_VECTOR
    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
	pushq	$(~vector+0x80)			/* Note: always in signed byte range */
    vector=vector+1
	jmp	common_interrupt
	.align	8
    .endr
END(irq_entries_start)
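/*
 * Worked example of the vector encoding (informational): for vector 0x20
 * the stub pushes ~0x20 + 0x80 == -0x21 + 0x80 == 0x5f, which fits in a
 * signed byte and keeps each stub inside its 8-byte slot.  The
 * "addq $-0x80, (%rsp)" in common_interrupt turns this back into -0x21,
 * i.e. ~vector, for the interrupt dispatch code.
 */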

/*
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only callee clobbered registers in fast path.
 *
 * Entry runs with interrupts off.
 */

/* 0(%rsp): ~(interrupt number) */
	.macro interrupt func
	cld
	/*
	 * Since nothing in interrupt handling code touches r12...r15 members
	 * of "struct pt_regs", and since interrupts can nest, we can save
	 * four stack slots and simultaneously provide
	 * an unwind-friendly stack layout by saving "truncated" pt_regs
	 * exactly up to rbp slot, without these members.
	 */
	ALLOC_PT_GPREGS_ON_STACK -RBP
	SAVE_C_REGS -RBP
	/* this goes to 0(%rsp) for unwinder, not for saving the value: */
	SAVE_EXTRA_REGS_RBP -RBP

	leaq	-RBP(%rsp), %rdi		/* arg1 for \func (pointer to pt_regs) */

	testb	$3, CS-RBP(%rsp)
	jz	1f
	SWAPGS
1:
	/*
	 * Save previous stack pointer, optionally switch to interrupt stack.
	 * irq_count is used to check if a CPU is already on an interrupt stack
	 * or not. While this is essentially redundant with preempt_count it is
	 * a little cheaper to use a separate counter in the PDA (short of
	 * moving irq_enter into assembly, which would be too much work)
	 */
	movq	%rsp, %rsi
	incl	PER_CPU_VAR(irq_count)
	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
	pushq	%rsi
	/* We entered an interrupt context - irqs are off: */
	TRACE_IRQS_OFF

	call	\func
	.endm

	/*
	 * The interrupt stubs push (~vector+0x80) onto the stack and
	 * then jump to common_interrupt.
	 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	ASM_CLAC
	addq	$-0x80, (%rsp)			/* Adjust vector to [-256, -1] range */
	interrupt do_IRQ
	/* 0(%rsp): old RSP */
ret_from_intr:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	decl	PER_CPU_VAR(irq_count)

	/* Restore saved previous stack */
	popq	%rsi
	/* return code expects complete pt_regs - adjust rsp accordingly: */
	leaq	-RBP(%rsi), %rsp

	testb	$3, CS(%rsp)
	jz	retint_kernel
	/* Interrupt came from user space */
GLOBAL(retint_user)
	GET_THREAD_INFO(%rcx)

	/* %rcx: thread info. Interrupts are off. */
retint_with_reschedule:
	movl	$_TIF_WORK_MASK, %edi
retint_check:
	LOCKDEP_SYS_EXIT_IRQ
	movl	TI_flags(%rcx), %edx
	andl	%edi, %edx
	jnz	retint_careful

retint_swapgs:					/* return to user-space */
	/*
	 * The iretq could re-enable interrupts:
	 */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ

	SWAPGS
	jmp	restore_c_regs_and_iret

/* Returning to kernel space */
retint_kernel:
#ifdef CONFIG_PREEMPT
	/* Interrupts are off */
	/* Check if we need preemption */
	bt	$9, EFLAGS(%rsp)		/* were interrupts off? */
	jnc	1f
0:	cmpl	$0, PER_CPU_VAR(__preempt_count)
	jnz	1f
	call	preempt_schedule_irq
	jmp	0b
1:
#endif
	/*
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ

/*
 * At this label, code paths which return to kernel and to user,
 * which come from interrupts/exception and from syscalls, merge.
 */
restore_c_regs_and_iret:
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8
	INTERRUPT_RETURN

ENTRY(native_iret)
	/*
	 * Are we returning to a stack segment from the LDT? Note: in
	 * 64-bit mode SS:RSP on the exception stack is always valid.
	 */
#ifdef CONFIG_X86_ESPFIX64
	testb	$4, (SS-RIP)(%rsp)
	jnz	native_irq_return_ldt
#endif

.global native_irq_return_iret
native_irq_return_iret:
	/*
	 * This may fault.  Non-paranoid faults on return to userspace are
	 * handled by fixup_bad_iret.  These include #SS, #GP, and #NP.
	 * Double-faults due to espfix64 are handled in do_double_fault.
	 * Other faults here are fatal.
	 */
	iretq

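/*
 * Background on the espfix64 path below (summary, see also
 * arch/x86/kernel/espfix_64.c): an IRET to a 16-bit stack segment restores
 * only the low 16 bits of ESP, which would leak the upper bits of the
 * kernel stack address.  The code therefore copies the five-word iret
 * frame onto the per-cpu espfix stack (espfix_waddr/espfix_stack), whose
 * upper bits are well known, and issues the IRET from there.
 */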
#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
	pushq	%rax
	pushq	%rdi
	SWAPGS
	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* RAX */
	movq	(2*8)(%rsp), %rax		/* RIP */
	movq	%rax, (1*8)(%rdi)
	movq	(3*8)(%rsp), %rax		/* CS */
	movq	%rax, (2*8)(%rdi)
	movq	(4*8)(%rsp), %rax		/* RFLAGS */
	movq	%rax, (3*8)(%rdi)
	movq	(6*8)(%rsp), %rax		/* SS */
	movq	%rax, (5*8)(%rdi)
	movq	(5*8)(%rsp), %rax		/* RSP */
	movq	%rax, (4*8)(%rdi)
	andl	$0xffff0000, %eax
	popq	%rdi
	orq	PER_CPU_VAR(espfix_stack), %rax
	SWAPGS
	movq	%rax, %rsp
	popq	%rax
	jmp	native_irq_return_iret
#endif

	/* edi: workmask, edx: work */
retint_careful:
	bt	$TIF_NEED_RESCHED, %edx
	jnc	retint_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq	%rdi
	SCHEDULE_USER
	popq	%rdi
	GET_THREAD_INFO(%rcx)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp	retint_check

retint_signal:
	testl	$_TIF_DO_NOTIFY_MASK, %edx
	jz	retint_swapgs
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_EXTRA_REGS
	movq	$-1, ORIG_RAX(%rsp)
	xorl	%esi, %esi			/* oldset */
	movq	%rsp, %rdi			/* &pt_regs */
	call	do_notify_resume
	RESTORE_EXTRA_REGS
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	jmp	retint_with_reschedule

END(common_interrupt)

/*
 * APIC interrupts.
 */
.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
	ASM_CLAC
	pushq	$~(\num)
.Lcommon_\sym:
	interrupt \do_sym
	jmp	ret_from_intr
END(\sym)
.endm

#ifdef CONFIG_TRACING
#define trace(sym) trace_##sym
#define smp_trace(sym) smp_trace_##sym

.macro trace_apicinterrupt num sym
apicinterrupt3 \num trace(\sym) smp_trace(\sym)
.endm
#else
.macro trace_apicinterrupt num sym do_sym
.endm
#endif

.macro apicinterrupt num sym do_sym
apicinterrupt3 \num \sym \do_sym
trace_apicinterrupt \num \sym
.endm

#ifdef CONFIG_SMP
apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR		irq_move_cleanup_interrupt	smp_irq_move_cleanup_interrupt
apicinterrupt3 REBOOT_VECTOR			reboot_interrupt		smp_reboot_interrupt
#endif

#ifdef CONFIG_X86_UV
apicinterrupt3 UV_BAU_MESSAGE			uv_bau_message_intr1		uv_bau_message_interrupt
#endif

apicinterrupt LOCAL_TIMER_VECTOR		apic_timer_interrupt		smp_apic_timer_interrupt
apicinterrupt X86_PLATFORM_IPI_VECTOR		x86_platform_ipi		smp_x86_platform_ipi

#ifdef CONFIG_HAVE_KVM
apicinterrupt3 POSTED_INTR_VECTOR		kvm_posted_intr_ipi		smp_kvm_posted_intr_ipi
apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR	kvm_posted_intr_wakeup_ipi	smp_kvm_posted_intr_wakeup_ipi
#endif

#ifdef CONFIG_X86_MCE_THRESHOLD
apicinterrupt THRESHOLD_APIC_VECTOR		threshold_interrupt		smp_threshold_interrupt
#endif

#ifdef CONFIG_X86_MCE_AMD
apicinterrupt DEFERRED_ERROR_VECTOR		deferred_error_interrupt	smp_deferred_error_interrupt
#endif

#ifdef CONFIG_X86_THERMAL_VECTOR
apicinterrupt THERMAL_APIC_VECTOR		thermal_interrupt		smp_thermal_interrupt
#endif

#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR	call_function_single_interrupt	smp_call_function_single_interrupt
apicinterrupt CALL_FUNCTION_VECTOR		call_function_interrupt		smp_call_function_interrupt
apicinterrupt RESCHEDULE_VECTOR			reschedule_interrupt		smp_reschedule_interrupt
#endif

apicinterrupt ERROR_APIC_VECTOR			error_interrupt			smp_error_interrupt
apicinterrupt SPURIOUS_APIC_VECTOR		spurious_interrupt		smp_spurious_interrupt

#ifdef CONFIG_IRQ_WORK
apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
#endif

/*
 * Exception entry points.
 */
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)

.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
	/* Sanity check */
	.if \shift_ist != -1 && \paranoid == 0
	.error "using shift_ist requires paranoid=1"
	.endif

	ASM_CLAC
	PARAVIRT_ADJUST_EXCEPTION_FRAME

	.ifeq \has_error_code
	pushq	$-1				/* ORIG_RAX: no syscall to restart */
	.endif

	ALLOC_PT_GPREGS_ON_STACK

	.if \paranoid
	.if \paranoid == 1
	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
	jnz	1f
	.endif
	call	paranoid_entry
	.else
	call	error_entry
	.endif
	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */

	.if \paranoid
	.if \shift_ist != -1
	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
	.else
	TRACE_IRQS_OFF
	.endif
	.endif

	movq	%rsp, %rdi			/* pt_regs pointer */

	.if \has_error_code
	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
	.else
	xorl	%esi, %esi			/* no error code */
	.endif

	.if \shift_ist != -1
	subq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
	.endif

	call	\do_sym

	.if \shift_ist != -1
	addq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
	.endif

	/* these procedures expect "no swapgs" flag in ebx */
	.if \paranoid
	jmp	paranoid_exit
	.else
	jmp	error_exit
	.endif

	.if \paranoid == 1
	/*
	 * Paranoid entry from userspace.  Switch stacks and treat it
	 * as a normal entry.  This means that paranoid handlers
	 * run in real process context if user_mode(regs).
	 */
1:
	call	error_entry


	movq	%rsp, %rdi			/* pt_regs pointer */
	call	sync_regs
	movq	%rax, %rsp			/* switch stack */

	movq	%rsp, %rdi			/* pt_regs pointer */

	.if \has_error_code
	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
	.else
	xorl	%esi, %esi			/* no error code */
	.endif

	call	\do_sym

	jmp	error_exit			/* %ebx: no swapgs flag */
	.endif
END(\sym)
.endm
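/*
 * Usage summary for idtentry (derived from the macro above):
 * has_error_code=1 means the CPU pushed an error code, which is passed to
 * \do_sym as the second argument; otherwise $-1 is stored as ORIG_RAX.
 * paranoid=1 uses paranoid_entry and switches stacks via sync_regs when
 * the exception came from userspace; paranoid=2 (used by double_fault
 * below) skips that userspace check.  shift_ist temporarily moves the IST
 * entry by EXCEPTION_STKSZ around the handler call and requires paranoid=1.
 */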

#ifdef CONFIG_TRACING
.macro trace_idtentry sym do_sym has_error_code:req
idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#else
.macro trace_idtentry sym do_sym has_error_code:req
idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#endif

idtentry divide_error			do_divide_error			has_error_code=0
idtentry overflow			do_overflow			has_error_code=0
idtentry bounds				do_bounds			has_error_code=0
idtentry invalid_op			do_invalid_op			has_error_code=0
idtentry device_not_available		do_device_not_available		has_error_code=0
idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2
idtentry coprocessor_segment_overrun	do_coprocessor_segment_overrun	has_error_code=0
idtentry invalid_TSS			do_invalid_TSS			has_error_code=1
idtentry segment_not_present		do_segment_not_present		has_error_code=1
idtentry spurious_interrupt_bug		do_spurious_interrupt_bug	has_error_code=0
idtentry coprocessor_error		do_coprocessor_error		has_error_code=0
idtentry alignment_check		do_alignment_check		has_error_code=1
idtentry simd_coprocessor_error		do_simd_coprocessor_error	has_error_code=0


	/*
	 * Reload gs selector with exception handling
	 * edi:  new selector
	 */
ENTRY(native_load_gs_index)
	pushfq
	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
	SWAPGS
gs_change:
	movl	%edi, %gs
2:	mfence					/* workaround */
	SWAPGS
	popfq
	ret
END(native_load_gs_index)

	_ASM_EXTABLE(gs_change, bad_gs)
	.section .fixup, "ax"
	/* running with kernelgs */
bad_gs:
	SWAPGS					/* switch back to user gs */
	xorl	%eax, %eax
	movl	%eax, %gs
	jmp	2b
	.previous

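/*
 * Note (informational): the _ASM_EXTABLE entry above redirects a fault on
 * the "movl %edi, %gs" at gs_change (e.g. an invalid selector) to bad_gs,
 * which loads a null %gs selector instead and resumes at label 2.
 * error_entry below also special-cases a fault at gs_change, because it
 * can happen while the user gsbase is still active.
 */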
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(do_softirq_own_stack)
	pushq	%rbp
	mov	%rsp, %rbp
	incl	PER_CPU_VAR(irq_count)
	cmove	PER_CPU_VAR(irq_stack_ptr), %rsp
	push	%rbp				/* frame pointer backlink */
	call	__do_softirq
	leaveq
	decl	PER_CPU_VAR(irq_count)
	ret
END(do_softirq_own_stack)

#ifdef CONFIG_XEN
idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0

/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 */
ENTRY(xen_do_hypervisor_callback)		/* do_hypervisor_callback(struct *pt_regs) */

/*
 * Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will
 * see the correct pointer to the pt_regs
 */
	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
11:	incl	PER_CPU_VAR(irq_count)
	movq	%rsp, %rbp
	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
	pushq	%rbp				/* frame pointer backlink */
	call	xen_evtchn_do_upcall
	popq	%rsp
	decl	PER_CPU_VAR(irq_count)
#ifndef CONFIG_PREEMPT
	call	xen_maybe_preempt_hcall
#endif
	jmp	error_exit
END(xen_do_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
ENTRY(xen_failsafe_callback)
	movl	%ds, %ecx
	cmpw	%cx, 0x10(%rsp)
	jne	1f
	movl	%es, %ecx
	cmpw	%cx, 0x18(%rsp)
	jne	1f
	movl	%fs, %ecx
	cmpw	%cx, 0x20(%rsp)
	jne	1f
	movl	%gs, %ecx
	cmpw	%cx, 0x28(%rsp)
	jne	1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$0				/* RIP */
	pushq	%r11
	pushq	%rcx
	jmp	general_protection
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$-1				/* orig_ax = -1 => not a system call */
	ALLOC_PT_GPREGS_ON_STACK
	SAVE_C_REGS
	SAVE_EXTRA_REGS
	jmp	error_exit
END(xen_failsafe_callback)

apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
	xen_hvm_callback_vector xen_evtchn_do_upcall

#endif /* CONFIG_XEN */

#if IS_ENABLED(CONFIG_HYPERV)
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
	hyperv_callback_vector hyperv_vector_handler
#endif /* CONFIG_HYPERV */

idtentry debug			do_debug		has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
idtentry int3			do_int3			has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
idtentry stack_segment		do_stack_segment	has_error_code=1

#ifdef CONFIG_XEN
idtentry xen_debug		do_debug		has_error_code=0
idtentry xen_int3		do_int3			has_error_code=0
idtentry xen_stack_segment	do_stack_segment	has_error_code=1
#endif

idtentry general_protection	do_general_protection	has_error_code=1
trace_idtentry page_fault	do_page_fault		has_error_code=1

#ifdef CONFIG_KVM_GUEST
idtentry async_page_fault	do_async_page_fault	has_error_code=1
#endif

#ifdef CONFIG_X86_MCE
idtentry machine_check					has_error_code=0	paranoid=1 do_sym=*machine_check_vector(%rip)
#endif

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Use slow, but surefire "are we in kernel?" check.
 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
 */
ENTRY(paranoid_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	movl	$1, %ebx
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
	js	1f				/* negative -> in kernel */
	SWAPGS
	xorl	%ebx, %ebx
1:	ret
END(paranoid_entry)

/*
 * "Paranoid" exit path from exception stack.  This is invoked
 * only on return from non-NMI IST interrupts that came
 * from kernel space.
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
 * be complicated.  Fortunately, there's no good reason
 * to try to handle preemption here.
 *
 * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
 */
ENTRY(paranoid_exit)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF_DEBUG
	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	paranoid_exit_no_swapgs
	TRACE_IRQS_IRETQ
	SWAPGS_UNSAFE_STACK
	jmp	paranoid_exit_restore
paranoid_exit_no_swapgs:
	TRACE_IRQS_IRETQ_DEBUG
paranoid_exit_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8
	INTERRUPT_RETURN
END(paranoid_exit)

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Return: EBX=0: came from user mode; EBX=1: otherwise
 */
ENTRY(error_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	xorl	%ebx, %ebx
	testb	$3, CS+8(%rsp)
	jz	.Lerror_kernelspace

.Lerror_entry_from_usermode_swapgs:
	/*
	 * We entered from user mode or we're pretending to have entered
	 * from user mode due to an IRET fault.
	 */
	SWAPGS

.Lerror_entry_from_usermode_after_swapgs:
.Lerror_entry_done:
	TRACE_IRQS_OFF
	ret

	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here.  B stepping K8s sometimes report a
	 * truncated RIP for IRET exceptions returning to compat mode. Check
	 * for these here too.
	 */
.Lerror_kernelspace:
	incl	%ebx
	leaq	native_irq_return_iret(%rip), %rcx
	cmpq	%rcx, RIP+8(%rsp)
	je	.Lerror_bad_iret
	movl	%ecx, %eax			/* zero extend */
	cmpq	%rax, RIP+8(%rsp)
	je	.Lbstep_iret
	cmpq	$gs_change, RIP+8(%rsp)
	jne	.Lerror_entry_done

	/*
	 * hack: gs_change can fail with user gsbase.  If this happens, fix up
	 * gsbase and proceed.  We'll fix up the exception and land in
	 * gs_change's error handler with kernel gsbase.
	 */
	jmp	.Lerror_entry_from_usermode_swapgs

.Lbstep_iret:
	/* Fix truncated RIP */
	movq	%rcx, RIP+8(%rsp)
	/* fall through */

.Lerror_bad_iret:
	/*
	 * We came from an IRET to user mode, so we have user gsbase.
	 * Switch to kernel gsbase:
	 */
	SWAPGS

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
	 * as if we faulted immediately after IRET and clear EBX so that
	 * error_exit knows that we will be returning to user mode.
	 */
	mov	%rsp, %rdi
	call	fixup_bad_iret
	mov	%rax, %rsp
	decl	%ebx
	jmp	.Lerror_entry_from_usermode_after_swapgs
END(error_entry)


/*
 * On entry, EBX is a "return to kernel mode" flag:
 *   1: already in kernel mode, don't need SWAPGS
 *   0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
 */
ENTRY(error_exit)
	movl	%ebx, %eax
	RESTORE_EXTRA_REGS
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	%eax, %eax
	jnz	retint_kernel
	jmp	retint_user
END(error_exit)

/* Runs on exception stack */
ENTRY(nmi)
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *  Check a special location on the stack that contains
	 *  a variable that is set when NMIs are executing.
	 *  The interrupted task's stack is also checked to see if it
	 *  is an NMI stack.
	 *  If the variable is not set and the stack is not the NMI
	 *  stack then:
	 *    o Set the special variable on the stack
	 *    o Copy the interrupt frame into a "saved" location on the stack
	 *    o Copy the interrupt frame into a "copy" location on the stack
	 *    o Continue processing the NMI
	 *  If the variable is set or the previous stack is the NMI stack:
	 *    o Modify the "copy" location to jump to repeat_nmi
	 *    o return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable.
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 */

	/* Use %rdx as our temp variable throughout */
	pushq	%rdx

	/*
	 * If %cs was not the kernel segment, then the NMI triggered in user
	 * space, which means it is definitely not nested.
	 */
	cmpl	$__KERNEL_CS, 16(%rsp)
	jne	first_nmi

	/*
	 * Check the special variable on the stack to see if NMIs are
	 * executing.
	 */
	cmpl	$1, -8(%rsp)
	je	nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack.
	 * We need the double check. We check the NMI stack to satisfy the
	 * race when the first NMI clears the variable before returning.
	 * We check the variable because the first NMI could be in a
	 * breakpoint routine using a breakpoint stack.
	 */
	lea	6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
	cmpq	%rdx, 4*8(%rsp)
	/* If the stack pointer is above the NMI stack, this is a normal NMI */
	ja	first_nmi

	subq	$EXCEPTION_STKSZ, %rdx
	cmpq	%rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb	first_nmi
	/* Ah, it is within the NMI stack, treat it as nested */

nested_nmi:
	/*
	 * Do nothing if we interrupted the fixup in repeat_nmi.
	 * It's about to repeat the NMI handler, so we are fine
	 * with ignoring this one.
	 */
	movq	$repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	1f
	movq	$end_repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	nested_nmi_out

1:
	/* Set up the interrupted NMI's stack to jump to repeat_nmi */
	leaq	-1*8(%rsp), %rdx
	movq	%rdx, %rsp
	leaq	-10*8(%rsp), %rdx
	pushq	$__KERNEL_DS
	pushq	%rdx
	pushfq
	pushq	$__KERNEL_CS
	pushq	$repeat_nmi

	/* Put stack back */
	addq	$(6*8), %rsp

nested_nmi_out:
	popq	%rdx

	/* No need to check faults here */
	INTERRUPT_RETURN

first_nmi:
	/*
	 * Because nested NMIs will use the pushed location that we
	 * stored in rdx, we must keep that space available.
	 * Here's what our stack frame will look like:
	 * +-------------------------+
	 * | original SS             |
	 * | original Return RSP     |
	 * | original RFLAGS         |
	 * | original CS             |
	 * | original RIP            |
	 * +-------------------------+
	 * | temp storage for rdx    |
	 * +-------------------------+
	 * | NMI executing variable  |
	 * +-------------------------+
	 * | copied SS               |
	 * | copied Return RSP       |
	 * | copied RFLAGS           |
	 * | copied CS               |
	 * | copied RIP              |
	 * +-------------------------+
	 * | Saved SS                |
	 * | Saved Return RSP        |
	 * | Saved RFLAGS            |
	 * | Saved CS                |
	 * | Saved RIP               |
	 * +-------------------------+
	 * | pt_regs                 |
	 * +-------------------------+
	 *
	 * The saved stack frame is used to fix up the copied stack frame
	 * that a nested NMI may change to make the interrupted NMI iret jump
	 * to the repeat_nmi. The original stack frame and the temp storage
	 * are also used by nested NMIs and can not be trusted on exit.
	 */
	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
	movq	(%rsp), %rdx

	/* Set the NMI executing variable on the stack. */
	pushq	$1

	/* Leave room for the "copied" frame */
	subq	$(5*8), %rsp

	/* Copy the stack frame to the Saved frame */
	.rept 5
	pushq	11*8(%rsp)
	.endr

	/* Everything up to here is safe from nested NMIs */

	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
	 * nested NMI. The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
	 */
repeat_nmi:
	/*
	 * Update the stack variable to say we are still in NMI (the update
	 * is benign for the non-repeat case, where 1 was pushed just above
	 * to this very stack slot).
	 */
	movq	$1, 10*8(%rsp)

	/* Make another copy, this one may be modified by nested NMIs */
	addq	$(10*8), %rsp
	.rept 5
	pushq	-6*8(%rsp)
	.endr
	subq	$(5*8), %rsp
end_repeat_nmi:

	/*
	 * Everything below this point can be preempted by a nested
	 * NMI if the first NMI took an exception and reset our iret stack
	 * so that we repeat another NMI.
	 */
	pushq	$-1				/* ORIG_RAX: no syscall to restart */
	ALLOC_PT_GPREGS_ON_STACK

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled. An NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 */
	call	paranoid_entry

	/*
	 * Save off the CR2 register. If we take a page fault in the NMI then
	 * it could corrupt the CR2 value. If the NMI preempts a page fault
	 * handler before it was able to read the CR2 register, and then the
	 * NMI itself takes a page fault, the page fault that was preempted
	 * will read the information from the NMI page fault and not the
	 * origin fault. Save it off and restore it if it changes.
	 * Use the r12 callee-saved register.
	 */
	movq	%cr2, %r12

	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	do_nmi

	/* Did the NMI take a page fault? Restore cr2 if it did */
	movq	%cr2, %rcx
	cmpq	%rcx, %r12
	je	1f
	movq	%r12, %cr2
1:
	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	nmi_restore
nmi_swapgs:
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
	/* Pop the extra iret frame at once */
	REMOVE_PT_GPREGS_FROM_STACK 6*8

	/* Clear the NMI executing stack variable */
	movq	$0, 5*8(%rsp)
	INTERRUPT_RETURN
END(nmi)

ENTRY(ignore_sysret)
	mov	$-ENOSYS, %eax
	sysret
END(ignore_sysret)