1da177e4
LT
1/*
2 * linux/arch/x86_64/entry.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
1da177e4
LT
7 */
8
9/*
10 * entry.S contains the system-call and fault low-level handling routines.
11 *
8b4777a4
AL
12 * Some of this is documented in Documentation/x86/entry_64.txt
13 *
1da177e4
LT
14 * NOTE: This code handles signal-recognition, which happens every time
15 * after an interrupt and after each system call.
0bd7b798
AH
16 *
17 * Normal syscalls and interrupts don't save a full stack frame; this is
1da177e4 18 * only done for syscall tracing, signals or fork/exec et al.
0bd7b798
AH
19 *
20 * A note on terminology:
21 * - top of stack: Architecture defined interrupt frame from SS to RIP
22 * at the top of the kernel process stack.
0d2eb44f 23 * - partial stack frame: partially saved registers up to R11.
0bd7b798 24 * - full stack frame: Like partial stack frame, but all registers saved.
2e91a17b
AK
25 *
26 * Some macro usage:
27 * - CFI macros are used to generate dwarf2 unwind information for better
28 * backtraces. They don't change any code.
29 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
30 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
31 * There are unfortunately lots of special cases where some registers are
32 * not touched. The macro is a big mess that should be cleaned up.
33 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
34 * Gives a full stack frame.
35 * - ENTRY/END Define functions in the symbol table.
36 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
37 * frame that is otherwise undefined after a SYSCALL
38 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
39 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
1da177e4
LT
40 */
41
1da177e4
LT
42#include <linux/linkage.h>
43#include <asm/segment.h>
1da177e4
LT
44#include <asm/cache.h>
45#include <asm/errno.h>
46#include <asm/dwarf2.h>
47#include <asm/calling.h>
e2d5df93 48#include <asm/asm-offsets.h>
1da177e4
LT
49#include <asm/msr.h>
50#include <asm/unistd.h>
51#include <asm/thread_info.h>
52#include <asm/hw_irq.h>
0341c14d 53#include <asm/page_types.h>
2601e64d 54#include <asm/irqflags.h>
72fe4858 55#include <asm/paravirt.h>
395a59d0 56#include <asm/ftrace.h>
9939ddaf 57#include <asm/percpu.h>
d7abc0fa 58#include <asm/asm.h>
d7e7528b 59#include <linux/err.h>
1da177e4 60
86a1c34a
RM
61/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
62#include <linux/elf-em.h>
63#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
64#define __AUDIT_ARCH_64BIT 0x80000000
65#define __AUDIT_ARCH_LE 0x40000000
66
1da177e4 67 .code64
ea714547
JO
68 .section .entry.text, "ax"
69
606576ce 70#ifdef CONFIG_FUNCTION_TRACER
d61f82d0
SR
71#ifdef CONFIG_DYNAMIC_FTRACE
72ENTRY(mcount)
d61f82d0
SR
73 retq
74END(mcount)
75
08f6fba5
SR
76/* skip is set if stack has been adjusted */
77.macro ftrace_caller_setup skip=0
78 MCOUNT_SAVE_FRAME \skip
79
80 /* Load the ftrace_ops into the 3rd parameter */
81 leaq function_trace_op, %rdx
82
83 /* Load ip into the first parameter */
84 movq RIP(%rsp), %rdi
85 subq $MCOUNT_INSN_SIZE, %rdi
86 /* Load the parent_ip into the second parameter */
87 movq 8(%rbp), %rsi
88.endm
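/*
 * After ftrace_caller_setup, %rdi holds the traced function's address
 * (RIP from the saved frame, minus MCOUNT_INSN_SIZE), %rsi the parent
 * return address taken from 8(%rbp), and %rdx the ftrace_ops pointer,
 * i.e. the first three arguments expected by an ftrace callback.
 */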
89
d61f82d0 90ENTRY(ftrace_caller)
08f6fba5 91 /* Check if tracing was disabled (quick check) */
60a7ecf4
SR
92 cmpl $0, function_trace_stop
93 jne ftrace_stub
d61f82d0 94
08f6fba5
SR
95 ftrace_caller_setup
96 /* regs go into 4th parameter (but make it NULL) */
97 movq $0, %rcx
d61f82d0 98
bc8b2b92 99GLOBAL(ftrace_call)
d61f82d0
SR
100 call ftrace_stub
101
d680fe44 102 MCOUNT_RESTORE_FRAME
08f6fba5 103ftrace_return:
d61f82d0 104
48d68b20 105#ifdef CONFIG_FUNCTION_GRAPH_TRACER
bc8b2b92 106GLOBAL(ftrace_graph_call)
48d68b20
FW
107 jmp ftrace_stub
108#endif
d61f82d0 109
bc8b2b92 110GLOBAL(ftrace_stub)
d61f82d0
SR
111 retq
112END(ftrace_caller)
113
08f6fba5
SR
114ENTRY(ftrace_regs_caller)
115 /* Save the current flags before the compare (in SS location) */
116 pushfq
117
118 /* Check if tracing was disabled (quick check) */
119 cmpl $0, function_trace_stop
120 jne ftrace_restore_flags
121
122 /* skip=8 to skip flags saved in SS */
123 ftrace_caller_setup 8
124
125 /* Save the rest of pt_regs */
126 movq %r15, R15(%rsp)
127 movq %r14, R14(%rsp)
128 movq %r13, R13(%rsp)
129 movq %r12, R12(%rsp)
130 movq %r11, R11(%rsp)
131 movq %r10, R10(%rsp)
132 movq %rbp, RBP(%rsp)
133 movq %rbx, RBX(%rsp)
134 /* Copy saved flags */
135 movq SS(%rsp), %rcx
136 movq %rcx, EFLAGS(%rsp)
137 /* Kernel segments */
138 movq $__KERNEL_DS, %rcx
139 movq %rcx, SS(%rsp)
140 movq $__KERNEL_CS, %rcx
141 movq %rcx, CS(%rsp)
142 /* Stack - skipping return address */
143 leaq SS+16(%rsp), %rcx
144 movq %rcx, RSP(%rsp)
145
146 /* regs go into 4th parameter */
147 leaq (%rsp), %rcx
148
149GLOBAL(ftrace_regs_call)
150 call ftrace_stub
151
152 /* Copy flags back to SS, to restore them */
153 movq EFLAGS(%rsp), %rax
154 movq %rax, SS(%rsp)
155
156 /* restore the rest of pt_regs */
157 movq R15(%rsp), %r15
158 movq R14(%rsp), %r14
159 movq R13(%rsp), %r13
160 movq R12(%rsp), %r12
161 movq R10(%rsp), %r10
162 movq RBP(%rsp), %rbp
163 movq RBX(%rsp), %rbx
164
165 /* skip=8 to skip flags saved in SS */
166 MCOUNT_RESTORE_FRAME 8
167
168 /* Restore flags */
169 popfq
170
171 jmp ftrace_return
172ftrace_restore_flags:
173 popfq
174 jmp ftrace_stub
175
176END(ftrace_regs_caller)
177
178
d61f82d0 179#else /* ! CONFIG_DYNAMIC_FTRACE */
16444a8a 180ENTRY(mcount)
60a7ecf4
SR
181 cmpl $0, function_trace_stop
182 jne ftrace_stub
183
16444a8a
ACM
184 cmpq $ftrace_stub, ftrace_trace_function
185 jnz trace
48d68b20
FW
186
187#ifdef CONFIG_FUNCTION_GRAPH_TRACER
188 cmpq $ftrace_stub, ftrace_graph_return
189 jnz ftrace_graph_caller
e49dc19c
SR
190
191 cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
192 jnz ftrace_graph_caller
48d68b20
FW
193#endif
194
bc8b2b92 195GLOBAL(ftrace_stub)
16444a8a
ACM
196 retq
197
198trace:
d680fe44 199 MCOUNT_SAVE_FRAME
16444a8a 200
08f6fba5 201 movq RIP(%rsp), %rdi
16444a8a 202 movq 8(%rbp), %rsi
395a59d0 203 subq $MCOUNT_INSN_SIZE, %rdi
16444a8a
ACM
204
205 call *ftrace_trace_function
206
d680fe44 207 MCOUNT_RESTORE_FRAME
16444a8a
ACM
208
209 jmp ftrace_stub
210END(mcount)
d61f82d0 211#endif /* CONFIG_DYNAMIC_FTRACE */
606576ce 212#endif /* CONFIG_FUNCTION_TRACER */
16444a8a 213
48d68b20
FW
214#ifdef CONFIG_FUNCTION_GRAPH_TRACER
215ENTRY(ftrace_graph_caller)
d680fe44 216 MCOUNT_SAVE_FRAME
48d68b20
FW
217
218 leaq 8(%rbp), %rdi
08f6fba5 219 movq RIP(%rsp), %rsi
71e308a2 220 movq (%rbp), %rdx
bb4304c7 221 subq $MCOUNT_INSN_SIZE, %rsi
48d68b20
FW
222
223 call prepare_ftrace_return
224
d680fe44
CG
225 MCOUNT_RESTORE_FRAME
226
48d68b20
FW
227 retq
228END(ftrace_graph_caller)
229
bc8b2b92 230GLOBAL(return_to_handler)
4818d809 231 subq $24, %rsp
48d68b20 232
e71e99c2 233 /* Save the return values */
16444a8a 234 movq %rax, (%rsp)
e71e99c2 235 movq %rdx, 8(%rsp)
71e308a2 236 movq %rbp, %rdi
16444a8a 237
48d68b20 238 call ftrace_return_to_handler
16444a8a 239
194ec341 240 movq %rax, %rdi
e71e99c2 241 movq 8(%rsp), %rdx
16444a8a 242 movq (%rsp), %rax
194ec341
SR
243 addq $24, %rsp
244 jmp *%rdi
48d68b20 245#endif
16444a8a 246
16444a8a 247
dc37db4d 248#ifndef CONFIG_PREEMPT
1da177e4 249#define retint_kernel retint_restore_args
0bd7b798 250#endif
2601e64d 251
72fe4858 252#ifdef CONFIG_PARAVIRT
2be29982 253ENTRY(native_usergs_sysret64)
72fe4858
GOC
254 swapgs
255 sysretq
b3baaa13 256ENDPROC(native_usergs_sysret64)
72fe4858
GOC
257#endif /* CONFIG_PARAVIRT */
258
2601e64d
IM
259
260.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
261#ifdef CONFIG_TRACE_IRQFLAGS
262 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
263 jnc 1f
264 TRACE_IRQS_ON
2651:
266#endif
267.endm
268
5963e317
SR
269/*
270 * When dynamic function tracer is enabled it will add a breakpoint
271 * to all locations that it is about to modify, sync CPUs, update
272 * all the code, sync CPUs, then remove the breakpoints. In this time
273 * if lockdep is enabled, it might jump back into the debug handler
274 * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
275 *
276 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
277 * make sure the stack pointer does not get reset back to the top
278 * of the debug stack, and instead just reuses the current stack.
279 */
280#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
281
282.macro TRACE_IRQS_OFF_DEBUG
283 call debug_stack_set_zero
284 TRACE_IRQS_OFF
285 call debug_stack_reset
286.endm
287
288.macro TRACE_IRQS_ON_DEBUG
289 call debug_stack_set_zero
290 TRACE_IRQS_ON
291 call debug_stack_reset
292.endm
293
294.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
295 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
296 jnc 1f
297 TRACE_IRQS_ON_DEBUG
2981:
299.endm
300
301#else
302# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
303# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
304# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
305#endif
306
1da177e4 307/*
0bd7b798
AH
308 * C code is not supposed to know about undefined top of stack. Every time
309 * a C function with a pt_regs argument is called from the SYSCALL-based
1da177e4
LT
310 * fast path, FIXUP_TOP_OF_STACK is needed.
311 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
312 * manipulation.
0bd7b798
AH
313 */
314
315 /* %rsp:at FRAMEEND */
c002a1e6 316 .macro FIXUP_TOP_OF_STACK tmp offset=0
3d1e42a7 317 movq PER_CPU_VAR(old_rsp),\tmp
c002a1e6
AH
318 movq \tmp,RSP+\offset(%rsp)
319 movq $__USER_DS,SS+\offset(%rsp)
320 movq $__USER_CS,CS+\offset(%rsp)
321 movq $-1,RCX+\offset(%rsp)
322 movq R11+\offset(%rsp),\tmp /* get eflags */
323 movq \tmp,EFLAGS+\offset(%rsp)
1da177e4
LT
324 .endm
325
c002a1e6
AH
326 .macro RESTORE_TOP_OF_STACK tmp offset=0
327 movq RSP+\offset(%rsp),\tmp
3d1e42a7 328 movq \tmp,PER_CPU_VAR(old_rsp)
c002a1e6
AH
329 movq EFLAGS+\offset(%rsp),\tmp
330 movq \tmp,R11+\offset(%rsp)
1da177e4
LT
331 .endm
332
333 .macro FAKE_STACK_FRAME child_rip
334 /* push in order ss, rsp, eflags, cs, rip */
3829ee6b 335 xorl %eax, %eax
df5d1874 336 pushq_cfi $__KERNEL_DS /* ss */
7effaa88 337 /*CFI_REL_OFFSET ss,0*/
df5d1874 338 pushq_cfi %rax /* rsp */
7effaa88 339 CFI_REL_OFFSET rsp,0
1cf8343f 340 pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
7effaa88 341 /*CFI_REL_OFFSET rflags,0*/
df5d1874 342 pushq_cfi $__KERNEL_CS /* cs */
7effaa88 343 /*CFI_REL_OFFSET cs,0*/
df5d1874 344 pushq_cfi \child_rip /* rip */
7effaa88 345 CFI_REL_OFFSET rip,0
df5d1874 346 pushq_cfi %rax /* orig rax */
1da177e4
LT
347 .endm
348
349 .macro UNFAKE_STACK_FRAME
350 addq $8*6, %rsp
351 CFI_ADJUST_CFA_OFFSET -(6*8)
352 .endm
353
dcd072e2
AH
354/*
355 * initial frame state for interrupts (and exceptions without error code)
356 */
357 .macro EMPTY_FRAME start=1 offset=0
7effaa88 358 .if \start
dcd072e2 359 CFI_STARTPROC simple
adf14236 360 CFI_SIGNAL_FRAME
dcd072e2 361 CFI_DEF_CFA rsp,8+\offset
7effaa88 362 .else
dcd072e2 363 CFI_DEF_CFA_OFFSET 8+\offset
7effaa88 364 .endif
1da177e4 365 .endm
d99015b1
AH
366
367/*
dcd072e2 368 * initial frame state for interrupts (and exceptions without error code)
d99015b1 369 */
dcd072e2 370 .macro INTR_FRAME start=1 offset=0
e8a0e276
IM
371 EMPTY_FRAME \start, SS+8+\offset-RIP
372 /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
373 CFI_REL_OFFSET rsp, RSP+\offset-RIP
374 /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
375 /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
376 CFI_REL_OFFSET rip, RIP+\offset-RIP
d99015b1
AH
377 .endm
378
d99015b1
AH
379/*
380 * initial frame state for exceptions with error code (and interrupts
381 * with vector already pushed)
382 */
dcd072e2 383 .macro XCPT_FRAME start=1 offset=0
e8a0e276 384 INTR_FRAME \start, RIP+\offset-ORIG_RAX
dcd072e2
AH
385 /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
386 .endm
387
388/*
389 * frame that enables calling into C.
390 */
391 .macro PARTIAL_FRAME start=1 offset=0
e8a0e276
IM
392 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
393 CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
394 CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
395 CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
396 CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
397 CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
398 CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
399 CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
400 CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
401 CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
dcd072e2
AH
402 .endm
403
404/*
405 * frame that enables passing a complete pt_regs to a C function.
406 */
407 .macro DEFAULT_FRAME start=1 offset=0
e8a0e276 408 PARTIAL_FRAME \start, R11+\offset-R15
dcd072e2
AH
409 CFI_REL_OFFSET rbx, RBX+\offset
410 CFI_REL_OFFSET rbp, RBP+\offset
411 CFI_REL_OFFSET r12, R12+\offset
412 CFI_REL_OFFSET r13, R13+\offset
413 CFI_REL_OFFSET r14, R14+\offset
414 CFI_REL_OFFSET r15, R15+\offset
415 .endm
d99015b1
AH
416
417/* save partial stack frame */
1871853f 418 .macro SAVE_ARGS_IRQ
d99015b1 419 cld
1871853f
FW
420 /* start from rbp in pt_regs and jump over */
421 movq_cfi rdi, RDI-RBP
422 movq_cfi rsi, RSI-RBP
423 movq_cfi rdx, RDX-RBP
424 movq_cfi rcx, RCX-RBP
425 movq_cfi rax, RAX-RBP
426 movq_cfi r8, R8-RBP
427 movq_cfi r9, R9-RBP
428 movq_cfi r10, R10-RBP
429 movq_cfi r11, R11-RBP
430
a2bbe750
FW
431 /* Save rbp so that we can unwind from get_irq_regs() */
432 movq_cfi rbp, 0
433
434 /* Save previous stack value */
435 movq %rsp, %rsi
3b99a3ef
FW
436
437 leaq -RBP(%rsp),%rdi /* arg1 for handler */
69466466 438 testl $3, CS-RBP(%rsi)
d99015b1
AH
439 je 1f
440 SWAPGS
441 /*
56895530 442 * irq_count is used to check if a CPU is already on an interrupt stack
d99015b1
AH
443 * or not. While this is essentially redundant with preempt_count it is
444 * a little cheaper to use a separate counter in the PDA (short of
445 * moving irq_enter into assembly, which would be too much work)
446 */
56895530 4471: incl PER_CPU_VAR(irq_count)
69466466 448 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
eab9e613 449 CFI_DEF_CFA_REGISTER rsi
a2bbe750 450
69466466 451 /* Store previous stack value */
a2bbe750 452 pushq %rsi
eab9e613
JB
453 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
454 0x77 /* DW_OP_breg7 */, 0, \
455 0x06 /* DW_OP_deref */, \
456 0x08 /* DW_OP_const1u */, SS+8-RBP, \
457 0x22 /* DW_OP_plus */
a2bbe750
FW
458 /* We entered an interrupt context - irqs are off: */
459 TRACE_IRQS_OFF
1871853f 460 .endm
d99015b1 461
c002a1e6
AH
462ENTRY(save_rest)
463 PARTIAL_FRAME 1 REST_SKIP+8
464 movq 5*8+16(%rsp), %r11 /* save return address */
465 movq_cfi rbx, RBX+16
466 movq_cfi rbp, RBP+16
467 movq_cfi r12, R12+16
468 movq_cfi r13, R13+16
469 movq_cfi r14, R14+16
470 movq_cfi r15, R15+16
471 movq %r11, 8(%rsp) /* return address */
472 FIXUP_TOP_OF_STACK %r11, 16
473 ret
474 CFI_ENDPROC
475END(save_rest)
476
e2f6bc25 477/* save complete stack frame */
c2810188 478 .pushsection .kprobes.text, "ax"
e2f6bc25
AH
479ENTRY(save_paranoid)
480 XCPT_FRAME 1 RDI+8
481 cld
482 movq_cfi rdi, RDI+8
483 movq_cfi rsi, RSI+8
484 movq_cfi rdx, RDX+8
485 movq_cfi rcx, RCX+8
486 movq_cfi rax, RAX+8
487 movq_cfi r8, R8+8
488 movq_cfi r9, R9+8
489 movq_cfi r10, R10+8
490 movq_cfi r11, R11+8
491 movq_cfi rbx, RBX+8
492 movq_cfi rbp, RBP+8
493 movq_cfi r12, R12+8
494 movq_cfi r13, R13+8
495 movq_cfi r14, R14+8
496 movq_cfi r15, R15+8
497 movl $1,%ebx
498 movl $MSR_GS_BASE,%ecx
499 rdmsr
500 testl %edx,%edx
501 js 1f /* negative -> in kernel */
502 SWAPGS
503 xorl %ebx,%ebx
5041: ret
505 CFI_ENDPROC
506END(save_paranoid)
c2810188 507 .popsection
e2f6bc25 508
1da177e4 509/*
5b3eec0c
IM
510 * A newly forked process directly context switches into this address.
511 *
512 * rdi: prev task we switched from
0bd7b798 513 */
1da177e4 514ENTRY(ret_from_fork)
dcd072e2 515 DEFAULT_FRAME
5b3eec0c 516
7106a5ab
BL
517 LOCK ; btr $TIF_FORK,TI_flags(%r8)
518
df5d1874
JB
519 pushq_cfi kernel_eflags(%rip)
520 popfq_cfi # reset kernel eflags
5b3eec0c
IM
521
522 call schedule_tail # rdi: 'prev' task parameter
523
1da177e4 524 GET_THREAD_INFO(%rcx)
5b3eec0c 525
1da177e4 526 RESTORE_REST
5b3eec0c
IM
527
528 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
70ea6855 529 jz retint_restore_args
5b3eec0c
IM
530
531 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
1da177e4 532 jnz int_ret_from_sys_call
5b3eec0c 533
c002a1e6 534 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
5b3eec0c
IM
535 jmp ret_from_sys_call # go to the SYSRET fastpath
536
1da177e4 537 CFI_ENDPROC
4b787e0b 538END(ret_from_fork)
1da177e4
LT
539
540/*
0d2eb44f 541 * System call entry. Up to 6 arguments in registers are supported.
1da177e4
LT
542 *
543 * SYSCALL does not save anything on the stack and does not change the
544 * stack pointer.
545 */
0bd7b798 546
1da177e4 547/*
0bd7b798 548 * Register setup:
1da177e4
LT
549 * rax system call number
550 * rdi arg0
0bd7b798 551 * rcx return address for syscall/sysret, C arg3
1da177e4 552 * rsi arg1
0bd7b798 553 * rdx arg2
1da177e4
LT
554 * r10 arg3 (--> moved to rcx for C)
555 * r8 arg4
556 * r9 arg5
557 * r11 eflags for syscall/sysret, temporary for C
0bd7b798
AH
558 * r12-r15,rbp,rbx saved by C code, not touched.
559 *
1da177e4
LT
560 * Interrupts are off on entry.
561 * Only called from user space.
562 *
563 * XXX if we had a free scratch register we could save the RSP into the stack frame
564 * and report it properly in ps. Unfortunately we don't have one.
7bf36bbc
AK
565 *
566 * When the user can change the frames, always force IRET. That is because
567 * it deals with non-canonical addresses better. SYSRET has trouble
568 * with them due to bugs in both AMD and Intel CPUs.
0bd7b798 569 */
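/*
 * For illustration only: a user-space getpid() reaching this entry point
 * is issued roughly as
 *	movl	$__NR_getpid, %eax
 *	syscall
 * with the result returned in %rax; calls with arguments pass them in
 * %rdi, %rsi, %rdx, %r10, %r8, %r9 as listed above.
 */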
1da177e4
LT
570
571ENTRY(system_call)
7effaa88 572 CFI_STARTPROC simple
adf14236 573 CFI_SIGNAL_FRAME
9af45651 574 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
7effaa88
JB
575 CFI_REGISTER rip,rcx
576 /*CFI_REGISTER rflags,r11*/
72fe4858
GOC
577 SWAPGS_UNSAFE_STACK
578 /*
579 * A hypervisor implementation might want to use a label
580 * after the swapgs, so that it can do the swapgs
581 * for the guest and jump here on syscall.
582 */
f6b2bc84 583GLOBAL(system_call_after_swapgs)
72fe4858 584
3d1e42a7 585 movq %rsp,PER_CPU_VAR(old_rsp)
9af45651 586 movq PER_CPU_VAR(kernel_stack),%rsp
2601e64d
IM
587 /*
588 * No need to follow this irqs off/on section - it's straight
589 * and short:
590 */
72fe4858 591 ENABLE_INTERRUPTS(CLBR_NONE)
cac0e0a7 592 SAVE_ARGS 8,0
0bd7b798 593 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
7effaa88
JB
594 movq %rcx,RIP-ARGOFFSET(%rsp)
595 CFI_REL_OFFSET rip,RIP-ARGOFFSET
46db09d3 596 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
1da177e4 597 jnz tracesys
86a1c34a 598system_call_fastpath:
fca460f9 599#if __SYSCALL_MASK == ~0
1da177e4 600 cmpq $__NR_syscall_max,%rax
fca460f9
PA
601#else
602 andl $__SYSCALL_MASK,%eax
603 cmpl $__NR_syscall_max,%eax
604#endif
1da177e4
LT
605 ja badsys
606 movq %r10,%rcx
607 call *sys_call_table(,%rax,8) # XXX: rip relative
608 movq %rax,RAX-ARGOFFSET(%rsp)
609/*
610 * Syscall return path ending with SYSRET (fast path)
0bd7b798
AH
611 * Has incomplete stack frame and undefined top of stack.
612 */
1da177e4 613ret_from_sys_call:
11b854b2 614 movl $_TIF_ALLWORK_MASK,%edi
1da177e4 615 /* edi: flagmask */
0bd7b798 616sysret_check:
10cd706d 617 LOCKDEP_SYS_EXIT
72fe4858 618 DISABLE_INTERRUPTS(CLBR_NONE)
2601e64d 619 TRACE_IRQS_OFF
46db09d3 620 movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
1da177e4 621 andl %edi,%edx
0bd7b798 622 jnz sysret_careful
bcddc015 623 CFI_REMEMBER_STATE
2601e64d
IM
624 /*
625 * sysretq will re-enable interrupts:
626 */
627 TRACE_IRQS_ON
1da177e4 628 movq RIP-ARGOFFSET(%rsp),%rcx
7effaa88 629 CFI_REGISTER rip,rcx
838feb47 630 RESTORE_ARGS 1,-ARG_SKIP,0
7effaa88 631 /*CFI_REGISTER rflags,r11*/
3d1e42a7 632 movq PER_CPU_VAR(old_rsp), %rsp
2be29982 633 USERGS_SYSRET64
1da177e4 634
bcddc015 635 CFI_RESTORE_STATE
1da177e4 636 /* Handle reschedules */
0bd7b798 637 /* edx: work, edi: workmask */
1da177e4
LT
638sysret_careful:
639 bt $TIF_NEED_RESCHED,%edx
640 jnc sysret_signal
2601e64d 641 TRACE_IRQS_ON
72fe4858 642 ENABLE_INTERRUPTS(CLBR_NONE)
df5d1874 643 pushq_cfi %rdi
1da177e4 644 call schedule
df5d1874 645 popq_cfi %rdi
1da177e4
LT
646 jmp sysret_check
647
0bd7b798 648 /* Handle a signal */
1da177e4 649sysret_signal:
2601e64d 650 TRACE_IRQS_ON
72fe4858 651 ENABLE_INTERRUPTS(CLBR_NONE)
86a1c34a
RM
652#ifdef CONFIG_AUDITSYSCALL
653 bt $TIF_SYSCALL_AUDIT,%edx
654 jc sysret_audit
655#endif
b60e714d
RM
656 /*
657 * We have a signal, or exit tracing or single-step.
658 * These all wind up with the iret return path anyway,
659 * so just join that path right now.
660 */
661 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
662 jmp int_check_syscall_exit_work
0bd7b798 663
7effaa88
JB
664badsys:
665 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
666 jmp ret_from_sys_call
667
86a1c34a
RM
668#ifdef CONFIG_AUDITSYSCALL
669 /*
670 * Fast path for syscall audit without full syscall trace.
b05d8447 671 * We just call __audit_syscall_entry() directly, and then
86a1c34a
RM
672 * jump back to the normal fast path.
673 */
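	/*
	 * The moves below shuffle (audit arch, syscall number, syscall
	 * args 1-4) into %rdi, %rsi, %rdx, %rcx, %r8, %r9, i.e. the six
	 * C argument registers of the x86-64 SysV calling convention.
	 */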
674auditsys:
675 movq %r10,%r9 /* 6th arg: 4th syscall arg */
676 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
677 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
678 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
679 movq %rax,%rsi /* 2nd arg: syscall number */
680 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
b05d8447 681 call __audit_syscall_entry
86a1c34a
RM
682 LOAD_ARGS 0 /* reload call-clobbered registers */
683 jmp system_call_fastpath
684
685 /*
d7e7528b 686 * Return fast path for syscall audit. Call __audit_syscall_exit()
86a1c34a
RM
687 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
688 * masked off.
689 */
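	/*
	 * Below, %rsi is loaded with the raw syscall return value and %edi
	 * with a flag derived from it (distinguishing error returns in the
	 * -MAX_ERRNO range from successful ones); these are the two
	 * arguments passed to __audit_syscall_exit().
	 */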
690sysret_audit:
03275591 691 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
d7e7528b
EP
692 cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
693 setbe %al /* 1 if so, 0 if not */
86a1c34a 694 movzbl %al,%edi /* zero-extend that into %edi */
d7e7528b 695 call __audit_syscall_exit
86a1c34a
RM
696 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
697 jmp sysret_check
698#endif /* CONFIG_AUDITSYSCALL */
699
1da177e4 700 /* Do syscall tracing */
0bd7b798 701tracesys:
86a1c34a 702#ifdef CONFIG_AUDITSYSCALL
46db09d3 703 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
86a1c34a
RM
704 jz auditsys
705#endif
1da177e4 706 SAVE_REST
a31f8dd7 707 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
1da177e4
LT
708 FIXUP_TOP_OF_STACK %rdi
709 movq %rsp,%rdi
710 call syscall_trace_enter
d4d67150
RM
711 /*
712 * Reload arg registers from stack in case ptrace changed them.
713 * We don't reload %rax because syscall_trace_enter() returned
714 * the value it wants us to use in the table lookup.
715 */
716 LOAD_ARGS ARGOFFSET, 1
1da177e4 717 RESTORE_REST
fca460f9 718#if __SYSCALL_MASK == ~0
1da177e4 719 cmpq $__NR_syscall_max,%rax
fca460f9
PA
720#else
721 andl $__SYSCALL_MASK,%eax
722 cmpl $__NR_syscall_max,%eax
723#endif
a31f8dd7 724 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
1da177e4
LT
725 movq %r10,%rcx /* fixup for C */
726 call *sys_call_table(,%rax,8)
a31f8dd7 727 movq %rax,RAX-ARGOFFSET(%rsp)
7bf36bbc 728 /* Use IRET because user could have changed frame */
0bd7b798
AH
729
730/*
1da177e4
LT
731 * Syscall return path ending with IRET.
732 * Has correct top of stack, but partial stack frame.
bcddc015 733 */
bc8b2b92 734GLOBAL(int_ret_from_sys_call)
72fe4858 735 DISABLE_INTERRUPTS(CLBR_NONE)
2601e64d 736 TRACE_IRQS_OFF
1da177e4
LT
737 movl $_TIF_ALLWORK_MASK,%edi
738 /* edi: mask to check */
bc8b2b92 739GLOBAL(int_with_check)
10cd706d 740 LOCKDEP_SYS_EXIT_IRQ
1da177e4 741 GET_THREAD_INFO(%rcx)
26ccb8a7 742 movl TI_flags(%rcx),%edx
1da177e4
LT
743 andl %edi,%edx
744 jnz int_careful
26ccb8a7 745 andl $~TS_COMPAT,TI_status(%rcx)
1da177e4
LT
746 jmp retint_swapgs
747
748 /* Either reschedule or signal or syscall exit tracking needed. */
749 /* First do a reschedule test. */
750 /* edx: work, edi: workmask */
751int_careful:
752 bt $TIF_NEED_RESCHED,%edx
753 jnc int_very_careful
2601e64d 754 TRACE_IRQS_ON
72fe4858 755 ENABLE_INTERRUPTS(CLBR_NONE)
df5d1874 756 pushq_cfi %rdi
1da177e4 757 call schedule
df5d1874 758 popq_cfi %rdi
72fe4858 759 DISABLE_INTERRUPTS(CLBR_NONE)
2601e64d 760 TRACE_IRQS_OFF
1da177e4
LT
761 jmp int_with_check
762
763 /* handle signals and tracing -- both require a full stack frame */
764int_very_careful:
2601e64d 765 TRACE_IRQS_ON
72fe4858 766 ENABLE_INTERRUPTS(CLBR_NONE)
b60e714d 767int_check_syscall_exit_work:
1da177e4 768 SAVE_REST
0bd7b798 769 /* Check for syscall exit trace */
d4d67150 770 testl $_TIF_WORK_SYSCALL_EXIT,%edx
1da177e4 771 jz int_signal
df5d1874 772 pushq_cfi %rdi
0bd7b798 773 leaq 8(%rsp),%rdi # &ptregs -> arg1
1da177e4 774 call syscall_trace_leave
df5d1874 775 popq_cfi %rdi
d4d67150 776 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
1da177e4 777 jmp int_restore_rest
0bd7b798 778
1da177e4 779int_signal:
8f4d37ec 780 testl $_TIF_DO_NOTIFY_MASK,%edx
1da177e4
LT
781 jz 1f
782 movq %rsp,%rdi # &ptregs -> arg1
783 xorl %esi,%esi # oldset -> arg2
784 call do_notify_resume
eca91e78 7851: movl $_TIF_WORK_MASK,%edi
1da177e4
LT
786int_restore_rest:
787 RESTORE_REST
72fe4858 788 DISABLE_INTERRUPTS(CLBR_NONE)
2601e64d 789 TRACE_IRQS_OFF
1da177e4
LT
790 jmp int_with_check
791 CFI_ENDPROC
bcddc015 792END(system_call)
0bd7b798
AH
793
794/*
1da177e4 795 * Certain special system calls that need to save a complete full stack frame.
0bd7b798 796 */
1da177e4 797 .macro PTREGSCALL label,func,arg
c002a1e6
AH
798ENTRY(\label)
799 PARTIAL_FRAME 1 8 /* offset 8: return address */
800 subq $REST_SKIP, %rsp
801 CFI_ADJUST_CFA_OFFSET REST_SKIP
802 call save_rest
803 DEFAULT_FRAME 0 8 /* offset 8: return address */
804 leaq 8(%rsp), \arg /* pt_regs pointer */
805 call \func
806 jmp ptregscall_common
807 CFI_ENDPROC
4b787e0b 808END(\label)
1da177e4
LT
809 .endm
810
811 PTREGSCALL stub_clone, sys_clone, %r8
812 PTREGSCALL stub_fork, sys_fork, %rdi
813 PTREGSCALL stub_vfork, sys_vfork, %rdi
1da177e4
LT
814 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
815 PTREGSCALL stub_iopl, sys_iopl, %rsi
816
817ENTRY(ptregscall_common)
c002a1e6
AH
818 DEFAULT_FRAME 1 8 /* offset 8: return address */
819 RESTORE_TOP_OF_STACK %r11, 8
820 movq_cfi_restore R15+8, r15
821 movq_cfi_restore R14+8, r14
822 movq_cfi_restore R13+8, r13
823 movq_cfi_restore R12+8, r12
824 movq_cfi_restore RBP+8, rbp
825 movq_cfi_restore RBX+8, rbx
826 ret $REST_SKIP /* pop extended registers */
1da177e4 827 CFI_ENDPROC
4b787e0b 828END(ptregscall_common)
0bd7b798 829
1da177e4
LT
830ENTRY(stub_execve)
831 CFI_STARTPROC
e6b04b6b
JB
832 addq $8, %rsp
833 PARTIAL_FRAME 0
1da177e4 834 SAVE_REST
1da177e4 835 FIXUP_TOP_OF_STACK %r11
5d119b2c 836 movq %rsp, %rcx
1da177e4 837 call sys_execve
1da177e4 838 RESTORE_TOP_OF_STACK %r11
1da177e4
LT
839 movq %rax,RAX(%rsp)
840 RESTORE_REST
841 jmp int_ret_from_sys_call
842 CFI_ENDPROC
4b787e0b 843END(stub_execve)
0bd7b798 844
1da177e4
LT
845/*
846 * sigreturn is special because it needs to restore all registers on return.
847 * This cannot be done with SYSRET, so use the IRET return path instead.
0bd7b798 848 */
1da177e4
LT
849ENTRY(stub_rt_sigreturn)
850 CFI_STARTPROC
7effaa88 851 addq $8, %rsp
e6b04b6b 852 PARTIAL_FRAME 0
1da177e4
LT
853 SAVE_REST
854 movq %rsp,%rdi
855 FIXUP_TOP_OF_STACK %r11
856 call sys_rt_sigreturn
857 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
858 RESTORE_REST
859 jmp int_ret_from_sys_call
860 CFI_ENDPROC
4b787e0b 861END(stub_rt_sigreturn)
1da177e4 862
c5a37394
PA
863#ifdef CONFIG_X86_X32_ABI
864 PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx
865
866ENTRY(stub_x32_rt_sigreturn)
867 CFI_STARTPROC
868 addq $8, %rsp
869 PARTIAL_FRAME 0
870 SAVE_REST
871 movq %rsp,%rdi
872 FIXUP_TOP_OF_STACK %r11
873 call sys32_x32_rt_sigreturn
874 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
875 RESTORE_REST
876 jmp int_ret_from_sys_call
877 CFI_ENDPROC
878END(stub_x32_rt_sigreturn)
879
d1a797f3
PA
880ENTRY(stub_x32_execve)
881 CFI_STARTPROC
882 addq $8, %rsp
883 PARTIAL_FRAME 0
884 SAVE_REST
885 FIXUP_TOP_OF_STACK %r11
886 movq %rsp, %rcx
887 call sys32_execve
888 RESTORE_TOP_OF_STACK %r11
889 movq %rax,RAX(%rsp)
890 RESTORE_REST
891 jmp int_ret_from_sys_call
892 CFI_ENDPROC
893END(stub_x32_execve)
894
c5a37394
PA
895#endif
896
939b7871
PA
897/*
898 * Build the entry stubs and pointer table with some assembler magic.
899 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
900 * single cache line on all modern x86 implementations.
901 */
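/*
 * Sketch of what the .rept body below generates for each vector: a stub of
 * the form
 *	pushq	$(~vector+0x80)		# vector number, signed-byte encoded
 *	jmp	common_interrupt	# reached via a short "jmp 2f" within each group of 7
 * plus a pointer to that stub in the interrupt table emitted into
 * .init.rodata.
 */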
902 .section .init.rodata,"a"
903ENTRY(interrupt)
ea714547 904 .section .entry.text
939b7871
PA
905 .p2align 5
906 .p2align CONFIG_X86_L1_CACHE_SHIFT
907ENTRY(irq_entries_start)
908 INTR_FRAME
909vector=FIRST_EXTERNAL_VECTOR
910.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
911 .balign 32
912 .rept 7
913 .if vector < NR_VECTORS
8665596e 914 .if vector <> FIRST_EXTERNAL_VECTOR
939b7871
PA
915 CFI_ADJUST_CFA_OFFSET -8
916 .endif
df5d1874 9171: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
8665596e 918 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
939b7871
PA
919 jmp 2f
920 .endif
921 .previous
922 .quad 1b
ea714547 923 .section .entry.text
939b7871
PA
924vector=vector+1
925 .endif
926 .endr
9272: jmp common_interrupt
928.endr
929 CFI_ENDPROC
930END(irq_entries_start)
931
932.previous
933END(interrupt)
934.previous
935
d99015b1 936/*
1da177e4
LT
937 * Interrupt entry/exit.
938 *
939 * Interrupt entry points save only callee-clobbered registers in the fast path.
d99015b1
AH
940 *
941 * Entry runs with interrupts off.
942 */
1da177e4 943
722024db 944/* 0(%rsp): ~(interrupt number) */
1da177e4 945 .macro interrupt func
625dbc3b
FW
946 /* reserve pt_regs for scratch regs and rbp */
947 subq $ORIG_RAX-RBP, %rsp
948 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
1871853f 949 SAVE_ARGS_IRQ
1da177e4
LT
950 call \func
951 .endm
952
8222d718
MH
953/*
954 * Interrupt entry/exit should be protected against kprobes
955 */
956 .pushsection .kprobes.text, "ax"
722024db
AH
957 /*
958 * The interrupt stubs push (~vector+0x80) onto the stack and
959 * then jump to common_interrupt.
960 */
939b7871
PA
961 .p2align CONFIG_X86_L1_CACHE_SHIFT
962common_interrupt:
7effaa88 963 XCPT_FRAME
722024db 964 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
1da177e4 965 interrupt do_IRQ
3d1e42a7 966 /* 0(%rsp): old_rsp-ARGOFFSET */
7effaa88 967ret_from_intr:
72fe4858 968 DISABLE_INTERRUPTS(CLBR_NONE)
2601e64d 969 TRACE_IRQS_OFF
56895530 970 decl PER_CPU_VAR(irq_count)
625dbc3b 971
a2bbe750
FW
972 /* Restore saved previous stack */
973 popq %rsi
928282e4 974 CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
eab9e613 975 leaq ARGOFFSET-RBP(%rsi), %rsp
7effaa88 976 CFI_DEF_CFA_REGISTER rsp
eab9e613 977 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
625dbc3b 978
7effaa88 979exit_intr:
1da177e4
LT
980 GET_THREAD_INFO(%rcx)
981 testl $3,CS-ARGOFFSET(%rsp)
982 je retint_kernel
0bd7b798 983
1da177e4
LT
984 /* Interrupt came from user space */
985 /*
986 * Has a correct top of stack, but a partial stack frame
987 * %rcx: thread info. Interrupts off.
0bd7b798 988 */
1da177e4
LT
989retint_with_reschedule:
990 movl $_TIF_WORK_MASK,%edi
7effaa88 991retint_check:
10cd706d 992 LOCKDEP_SYS_EXIT_IRQ
26ccb8a7 993 movl TI_flags(%rcx),%edx
1da177e4 994 andl %edi,%edx
7effaa88 995 CFI_REMEMBER_STATE
1da177e4 996 jnz retint_careful
10cd706d
PZ
997
998retint_swapgs: /* return to user-space */
2601e64d
IM
999 /*
1000 * The iretq could re-enable interrupts:
1001 */
72fe4858 1002 DISABLE_INTERRUPTS(CLBR_ANY)
2601e64d 1003 TRACE_IRQS_IRETQ
72fe4858 1004 SWAPGS
2601e64d
IM
1005 jmp restore_args
1006
10cd706d 1007retint_restore_args: /* return to kernel space */
72fe4858 1008 DISABLE_INTERRUPTS(CLBR_ANY)
2601e64d
IM
1009 /*
1010 * The iretq could re-enable interrupts:
1011 */
1012 TRACE_IRQS_IRETQ
1013restore_args:
838feb47 1014 RESTORE_ARGS 1,8,1
3701d863 1015
f7f3d791 1016irq_return:
72fe4858 1017 INTERRUPT_RETURN
d7abc0fa 1018 _ASM_EXTABLE(irq_return, bad_iret)
3701d863
IM
1019
1020#ifdef CONFIG_PARAVIRT
72fe4858 1021ENTRY(native_iret)
1da177e4 1022 iretq
d7abc0fa 1023 _ASM_EXTABLE(native_iret, bad_iret)
3701d863
IM
1024#endif
1025
1da177e4 1026 .section .fixup,"ax"
1da177e4 1027bad_iret:
3aa4b37d
RM
1028 /*
1029 * The iret traps when the %cs or %ss being restored is bogus.
1030 * We've lost the original trap vector and error code.
1031 * #GPF is the most likely one to get for an invalid selector.
1032 * So pretend we completed the iret and took the #GPF in user mode.
1033 *
1034 * We are now running with the kernel GS after exception recovery.
1035 * But error_entry expects us to have user GS to match the user %cs,
1036 * so swap back.
1037 */
1038 pushq $0
1039
1040 SWAPGS
1041 jmp general_protection
1042
72fe4858
GOC
1043 .previous
1044
7effaa88 1045 /* edi: workmask, edx: work */
1da177e4 1046retint_careful:
7effaa88 1047 CFI_RESTORE_STATE
1da177e4
LT
1048 bt $TIF_NEED_RESCHED,%edx
1049 jnc retint_signal
2601e64d 1050 TRACE_IRQS_ON
72fe4858 1051 ENABLE_INTERRUPTS(CLBR_NONE)
df5d1874 1052 pushq_cfi %rdi
1da177e4 1053 call schedule
df5d1874 1054 popq_cfi %rdi
1da177e4 1055 GET_THREAD_INFO(%rcx)
72fe4858 1056 DISABLE_INTERRUPTS(CLBR_NONE)
2601e64d 1057 TRACE_IRQS_OFF
1da177e4 1058 jmp retint_check
0bd7b798 1059
1da177e4 1060retint_signal:
8f4d37ec 1061 testl $_TIF_DO_NOTIFY_MASK,%edx
10ffdbb8 1062 jz retint_swapgs
2601e64d 1063 TRACE_IRQS_ON
72fe4858 1064 ENABLE_INTERRUPTS(CLBR_NONE)
1da177e4 1065 SAVE_REST
0bd7b798 1066 movq $-1,ORIG_RAX(%rsp)
3829ee6b 1067 xorl %esi,%esi # oldset
1da177e4
LT
1068 movq %rsp,%rdi # &pt_regs
1069 call do_notify_resume
1070 RESTORE_REST
72fe4858 1071 DISABLE_INTERRUPTS(CLBR_NONE)
2601e64d 1072 TRACE_IRQS_OFF
be9e6870 1073 GET_THREAD_INFO(%rcx)
eca91e78 1074 jmp retint_with_reschedule
1da177e4
LT
1075
1076#ifdef CONFIG_PREEMPT
1077 /* Returning to kernel space. Check if we need preemption */
1078 /* rcx: threadinfo. interrupts off. */
b06babac 1079ENTRY(retint_kernel)
26ccb8a7 1080 cmpl $0,TI_preempt_count(%rcx)
1da177e4 1081 jnz retint_restore_args
26ccb8a7 1082 bt $TIF_NEED_RESCHED,TI_flags(%rcx)
1da177e4
LT
1083 jnc retint_restore_args
1084 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
1085 jnc retint_restore_args
1086 call preempt_schedule_irq
1087 jmp exit_intr
0bd7b798 1088#endif
4b787e0b 1089
1da177e4 1090 CFI_ENDPROC
4b787e0b 1091END(common_interrupt)
8222d718
MH
1092/*
1093 * End of kprobes section
1094 */
1095 .popsection
0bd7b798 1096
1da177e4
LT
1097/*
1098 * APIC interrupts.
0bd7b798 1099 */
322648d1
AH
1100.macro apicinterrupt num sym do_sym
1101ENTRY(\sym)
7effaa88 1102 INTR_FRAME
df5d1874 1103 pushq_cfi $~(\num)
39e95433 1104.Lcommon_\sym:
322648d1 1105 interrupt \do_sym
1da177e4
LT
1106 jmp ret_from_intr
1107 CFI_ENDPROC
322648d1
AH
1108END(\sym)
1109.endm
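/*
 * For example, the "apicinterrupt LOCAL_TIMER_VECTOR ..." invocation below
 * defines ENTRY(apic_timer_interrupt), which pushes ~LOCAL_TIMER_VECTOR,
 * dispatches to smp_apic_timer_interrupt through the interrupt macro and
 * returns via ret_from_intr.
 */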
1da177e4 1110
322648d1
AH
1111#ifdef CONFIG_SMP
1112apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
1113 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
4ef702c1
AK
1114apicinterrupt REBOOT_VECTOR \
1115 reboot_interrupt smp_reboot_interrupt
322648d1 1116#endif
1da177e4 1117
03b48632 1118#ifdef CONFIG_X86_UV
5ae3a139 1119apicinterrupt UV_BAU_MESSAGE \
322648d1 1120 uv_bau_message_intr1 uv_bau_message_interrupt
03b48632 1121#endif
322648d1
AH
1122apicinterrupt LOCAL_TIMER_VECTOR \
1123 apic_timer_interrupt smp_apic_timer_interrupt
4a4de9c7
DS
1124apicinterrupt X86_PLATFORM_IPI_VECTOR \
1125 x86_platform_ipi smp_x86_platform_ipi
89b831ef 1126
322648d1 1127apicinterrupt THRESHOLD_APIC_VECTOR \
7856f6cc 1128 threshold_interrupt smp_threshold_interrupt
322648d1
AH
1129apicinterrupt THERMAL_APIC_VECTOR \
1130 thermal_interrupt smp_thermal_interrupt
1812924b 1131
322648d1
AH
1132#ifdef CONFIG_SMP
1133apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1134 call_function_single_interrupt smp_call_function_single_interrupt
1135apicinterrupt CALL_FUNCTION_VECTOR \
1136 call_function_interrupt smp_call_function_interrupt
1137apicinterrupt RESCHEDULE_VECTOR \
1138 reschedule_interrupt smp_reschedule_interrupt
1139#endif
1da177e4 1140
322648d1
AH
1141apicinterrupt ERROR_APIC_VECTOR \
1142 error_interrupt smp_error_interrupt
1143apicinterrupt SPURIOUS_APIC_VECTOR \
1144 spurious_interrupt smp_spurious_interrupt
0bd7b798 1145
e360adbe
PZ
1146#ifdef CONFIG_IRQ_WORK
1147apicinterrupt IRQ_WORK_VECTOR \
1148 irq_work_interrupt smp_irq_work_interrupt
241771ef
IM
1149#endif
1150
1da177e4
LT
1151/*
1152 * Exception entry points.
0bd7b798 1153 */
322648d1
AH
1154.macro zeroentry sym do_sym
1155ENTRY(\sym)
7effaa88 1156 INTR_FRAME
fab58420 1157 PARAVIRT_ADJUST_EXCEPTION_FRAME
14ae22ba 1158 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
b1cccb1b
JB
1159 subq $ORIG_RAX-R15, %rsp
1160 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
d99015b1 1161 call error_entry
dcd072e2 1162 DEFAULT_FRAME 0
d99015b1
AH
1163 movq %rsp,%rdi /* pt_regs pointer */
1164 xorl %esi,%esi /* no error code */
322648d1 1165 call \do_sym
d99015b1 1166 jmp error_exit /* %ebx: no swapgs flag */
7effaa88 1167 CFI_ENDPROC
322648d1
AH
1168END(\sym)
1169.endm
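/*
 * Sketch of an expansion: "zeroentry divide_error do_divide_error" below
 * defines ENTRY(divide_error), which pushes -1 as ORIG_RAX, builds the
 * pt_regs frame via error_entry, calls do_divide_error with the pt_regs
 * pointer in %rdi and a zero error code in %esi, and returns through
 * error_exit.
 */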
1da177e4 1170
322648d1 1171.macro paranoidzeroentry sym do_sym
ddeb8f21 1172ENTRY(\sym)
b8b1d08b
AH
1173 INTR_FRAME
1174 PARAVIRT_ADJUST_EXCEPTION_FRAME
b1cccb1b
JB
1175 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1176 subq $ORIG_RAX-R15, %rsp
1177 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
b8b1d08b
AH
1178 call save_paranoid
1179 TRACE_IRQS_OFF
1180 movq %rsp,%rdi /* pt_regs pointer */
1181 xorl %esi,%esi /* no error code */
322648d1 1182 call \do_sym
b8b1d08b
AH
1183 jmp paranoid_exit /* %ebx: no swapgs flag */
1184 CFI_ENDPROC
ddeb8f21 1185END(\sym)
322648d1 1186.endm
b8b1d08b 1187
c15a5958 1188#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
322648d1 1189.macro paranoidzeroentry_ist sym do_sym ist
ddeb8f21 1190ENTRY(\sym)
9f1e87ea 1191 INTR_FRAME
b8b1d08b 1192 PARAVIRT_ADJUST_EXCEPTION_FRAME
b1cccb1b
JB
1193 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1194 subq $ORIG_RAX-R15, %rsp
1195 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
b8b1d08b 1196 call save_paranoid
5963e317 1197 TRACE_IRQS_OFF_DEBUG
b8b1d08b
AH
1198 movq %rsp,%rdi /* pt_regs pointer */
1199 xorl %esi,%esi /* no error code */
c15a5958 1200 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
322648d1 1201 call \do_sym
c15a5958 1202 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
b8b1d08b
AH
1203 jmp paranoid_exit /* %ebx: no swapgs flag */
1204 CFI_ENDPROC
ddeb8f21 1205END(\sym)
322648d1 1206.endm
b8b1d08b 1207
ddeb8f21 1208.macro errorentry sym do_sym
322648d1 1209ENTRY(\sym)
7effaa88 1210 XCPT_FRAME
fab58420 1211 PARAVIRT_ADJUST_EXCEPTION_FRAME
b1cccb1b
JB
1212 subq $ORIG_RAX-R15, %rsp
1213 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
d99015b1 1214 call error_entry
dcd072e2 1215 DEFAULT_FRAME 0
d99015b1
AH
1216 movq %rsp,%rdi /* pt_regs pointer */
1217 movq ORIG_RAX(%rsp),%rsi /* get error code */
1218 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
322648d1 1219 call \do_sym
d99015b1 1220 jmp error_exit /* %ebx: no swapgs flag */
7effaa88 1221 CFI_ENDPROC
322648d1 1222END(\sym)
322648d1 1223.endm
1da177e4
LT
1224
1225 /* error code is on the stack already */
ddeb8f21 1226.macro paranoiderrorentry sym do_sym
322648d1 1227ENTRY(\sym)
b8b1d08b
AH
1228 XCPT_FRAME
1229 PARAVIRT_ADJUST_EXCEPTION_FRAME
b1cccb1b
JB
1230 subq $ORIG_RAX-R15, %rsp
1231 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
e2f6bc25
AH
1232 call save_paranoid
1233 DEFAULT_FRAME 0
7e61a793 1234 TRACE_IRQS_OFF
b8b1d08b
AH
1235 movq %rsp,%rdi /* pt_regs pointer */
1236 movq ORIG_RAX(%rsp),%rsi /* get error code */
1237 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
322648d1 1238 call \do_sym
b8b1d08b
AH
1239 jmp paranoid_exit /* %ebx: no swapgs flag */
1240 CFI_ENDPROC
322648d1 1241END(\sym)
322648d1
AH
1242.endm
1243
1244zeroentry divide_error do_divide_error
322648d1
AH
1245zeroentry overflow do_overflow
1246zeroentry bounds do_bounds
1247zeroentry invalid_op do_invalid_op
1248zeroentry device_not_available do_device_not_available
ddeb8f21 1249paranoiderrorentry double_fault do_double_fault
322648d1
AH
1250zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
1251errorentry invalid_TSS do_invalid_TSS
1252errorentry segment_not_present do_segment_not_present
322648d1
AH
1253zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
1254zeroentry coprocessor_error do_coprocessor_error
1255errorentry alignment_check do_alignment_check
322648d1 1256zeroentry simd_coprocessor_error do_simd_coprocessor_error
5cec93c2 1257
2601e64d 1258
9f1e87ea
CG
1259 /* Reload gs selector with exception handling */
1260 /* edi: new selector */
9f9d489a 1261ENTRY(native_load_gs_index)
7effaa88 1262 CFI_STARTPROC
df5d1874 1263 pushfq_cfi
b8aa287f 1264 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
9f1e87ea 1265 SWAPGS
0bd7b798 1266gs_change:
9f1e87ea 1267 movl %edi,%gs
1da177e4 12682: mfence /* workaround */
72fe4858 1269 SWAPGS
df5d1874 1270 popfq_cfi
9f1e87ea 1271 ret
7effaa88 1272 CFI_ENDPROC
6efdcfaf 1273END(native_load_gs_index)
0bd7b798 1274
d7abc0fa 1275 _ASM_EXTABLE(gs_change,bad_gs)
9f1e87ea 1276 .section .fixup,"ax"
1da177e4 1277 /* running with kernelgs */
0bd7b798 1278bad_gs:
72fe4858 1279 SWAPGS /* switch back to user gs */
1da177e4 1280 xorl %eax,%eax
9f1e87ea
CG
1281 movl %eax,%gs
1282 jmp 2b
1283 .previous
0bd7b798 1284
3bd95dfb 1285ENTRY(kernel_thread_helper)
c05991ed
AK
1286 pushq $0 # fake return address
1287 CFI_STARTPROC
1da177e4
LT
1288 /*
1289 * Here we are in the child and the registers are set as they were
1290 * at kernel_thread() invocation in the parent.
1291 */
3bd95dfb 1292 call *%rsi
1da177e4 1293 # exit
1c5b5cfd 1294 mov %eax, %edi
1da177e4 1295 call do_exit
5f5db591 1296 ud2 # padding for call trace
c05991ed 1297 CFI_ENDPROC
3bd95dfb 1298END(kernel_thread_helper)
1da177e4
LT
1299
1300/*
1301 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
1302 *
1303 * C extern interface:
c7887325 1304 * extern long execve(const char *name, char **argv, char **envp)
1da177e4
LT
1305 *
1306 * asm input arguments:
1307 * rdi: name, rsi: argv, rdx: envp
1308 *
1309 * We want to fall back into:
c7887325 1310 * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
1da177e4
LT
1311 *
1312 * do_sys_execve asm fallback arguments:
5d119b2c 1313 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
1da177e4 1314 */
3db03b4a 1315ENTRY(kernel_execve)
1da177e4
LT
1316 CFI_STARTPROC
1317 FAKE_STACK_FRAME $0
0bd7b798 1318 SAVE_ALL
5d119b2c 1319 movq %rsp,%rcx
1da177e4 1320 call sys_execve
0bd7b798 1321 movq %rax, RAX(%rsp)
1da177e4
LT
1322 RESTORE_REST
1323 testq %rax,%rax
1324 je int_ret_from_sys_call
1325 RESTORE_ARGS
1326 UNFAKE_STACK_FRAME
1327 ret
1328 CFI_ENDPROC
6efdcfaf 1329END(kernel_execve)
1da177e4 1330
2699500b 1331/* Call softirq on interrupt stack. Interrupts are off. */
ed6b676c 1332ENTRY(call_softirq)
7effaa88 1333 CFI_STARTPROC
df5d1874 1334 pushq_cfi %rbp
2699500b
AK
1335 CFI_REL_OFFSET rbp,0
1336 mov %rsp,%rbp
1337 CFI_DEF_CFA_REGISTER rbp
56895530 1338 incl PER_CPU_VAR(irq_count)
26f80bd6 1339 cmove PER_CPU_VAR(irq_stack_ptr),%rsp
2699500b 1340 push %rbp # backlink for old unwinder
ed6b676c 1341 call __do_softirq
2699500b 1342 leaveq
df5d1874 1343 CFI_RESTORE rbp
7effaa88 1344 CFI_DEF_CFA_REGISTER rsp
2699500b 1345 CFI_ADJUST_CFA_OFFSET -8
56895530 1346 decl PER_CPU_VAR(irq_count)
ed6b676c 1347 ret
7effaa88 1348 CFI_ENDPROC
6efdcfaf 1349END(call_softirq)
75154f40 1350
3d75e1b8 1351#ifdef CONFIG_XEN
322648d1 1352zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
3d75e1b8
JF
1353
1354/*
9f1e87ea
CG
1355 * A note on the "critical region" in our callback handler.
1356 * We want to avoid stacking callback handlers due to events occurring
1357 * during handling of the last event. To do this, we keep events disabled
1358 * until we've done all processing. HOWEVER, we must enable events before
1359 * popping the stack frame (can't be done atomically) and so it would still
1360 * be possible to get enough handler activations to overflow the stack.
1361 * Although unlikely, bugs of that kind are hard to track down, so we'd
1362 * like to avoid the possibility.
1363 * So, on entry to the handler we detect whether we interrupted an
1364 * existing activation in its critical region -- if so, we pop the current
1365 * activation and restart the handler using the previous one.
1366 */
3d75e1b8
JF
1367ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct pt_regs *)
1368 CFI_STARTPROC
9f1e87ea
CG
1369/*
1370 * Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
1371 * see the correct pointer to the pt_regs.
1372 */
3d75e1b8
JF
1373 movq %rdi, %rsp # we don't return, adjust the stack frame
1374 CFI_ENDPROC
dcd072e2 1375 DEFAULT_FRAME
56895530 137611: incl PER_CPU_VAR(irq_count)
3d75e1b8
JF
1377 movq %rsp,%rbp
1378 CFI_DEF_CFA_REGISTER rbp
26f80bd6 1379 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
3d75e1b8
JF
1380 pushq %rbp # backlink for old unwinder
1381 call xen_evtchn_do_upcall
1382 popq %rsp
1383 CFI_DEF_CFA_REGISTER rsp
56895530 1384 decl PER_CPU_VAR(irq_count)
3d75e1b8
JF
1385 jmp error_exit
1386 CFI_ENDPROC
371c394a 1387END(xen_do_hypervisor_callback)
3d75e1b8
JF
1388
1389/*
9f1e87ea
CG
1390 * Hypervisor uses this for application faults while it executes.
1391 * We get here for two reasons:
1392 * 1. Fault while reloading DS, ES, FS or GS
1393 * 2. Fault while executing IRET
1394 * Category 1 we do not need to fix up as Xen has already reloaded all segment
1395 * registers that could be reloaded and zeroed the others.
1396 * Category 2 we fix up by killing the current process. We cannot use the
1397 * normal Linux return path in this case because if we use the IRET hypercall
1398 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1399 * We distinguish between categories by comparing each saved segment register
1400 * with its current contents: any discrepancy means we are in category 1.
1401 */
3d75e1b8 1402ENTRY(xen_failsafe_callback)
dcd072e2
AH
1403 INTR_FRAME 1 (6*8)
1404 /*CFI_REL_OFFSET gs,GS*/
1405 /*CFI_REL_OFFSET fs,FS*/
1406 /*CFI_REL_OFFSET es,ES*/
1407 /*CFI_REL_OFFSET ds,DS*/
1408 CFI_REL_OFFSET r11,8
1409 CFI_REL_OFFSET rcx,0
3d75e1b8
JF
1410 movw %ds,%cx
1411 cmpw %cx,0x10(%rsp)
1412 CFI_REMEMBER_STATE
1413 jne 1f
1414 movw %es,%cx
1415 cmpw %cx,0x18(%rsp)
1416 jne 1f
1417 movw %fs,%cx
1418 cmpw %cx,0x20(%rsp)
1419 jne 1f
1420 movw %gs,%cx
1421 cmpw %cx,0x28(%rsp)
1422 jne 1f
1423 /* All segments match their saved values => Category 2 (Bad IRET). */
1424 movq (%rsp),%rcx
1425 CFI_RESTORE rcx
1426 movq 8(%rsp),%r11
1427 CFI_RESTORE r11
1428 addq $0x30,%rsp
1429 CFI_ADJUST_CFA_OFFSET -0x30
14ae22ba
IM
1430 pushq_cfi $0 /* RIP */
1431 pushq_cfi %r11
1432 pushq_cfi %rcx
4a5c3e77 1433 jmp general_protection
3d75e1b8
JF
1434 CFI_RESTORE_STATE
14351: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1436 movq (%rsp),%rcx
1437 CFI_RESTORE rcx
1438 movq 8(%rsp),%r11
1439 CFI_RESTORE r11
1440 addq $0x30,%rsp
1441 CFI_ADJUST_CFA_OFFSET -0x30
14ae22ba 1442 pushq_cfi $0
3d75e1b8
JF
1443 SAVE_ALL
1444 jmp error_exit
1445 CFI_ENDPROC
3d75e1b8
JF
1446END(xen_failsafe_callback)
1447
38e20b07
SY
1448apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
1449 xen_hvm_callback_vector xen_evtchn_do_upcall
1450
3d75e1b8 1451#endif /* CONFIG_XEN */
ddeb8f21
AH
1452
1453/*
1454 * Some functions should be protected against kprobes
1455 */
1456 .pushsection .kprobes.text, "ax"
1457
1458paranoidzeroentry_ist debug do_debug DEBUG_STACK
1459paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1460paranoiderrorentry stack_segment do_stack_segment
6cac5a92
JF
1461#ifdef CONFIG_XEN
1462zeroentry xen_debug do_debug
1463zeroentry xen_int3 do_int3
1464errorentry xen_stack_segment do_stack_segment
1465#endif
ddeb8f21
AH
1466errorentry general_protection do_general_protection
1467errorentry page_fault do_page_fault
631bc487
GN
1468#ifdef CONFIG_KVM_GUEST
1469errorentry async_page_fault do_async_page_fault
1470#endif
ddeb8f21 1471#ifdef CONFIG_X86_MCE
5d727926 1472paranoidzeroentry machine_check *machine_check_vector(%rip)
ddeb8f21
AH
1473#endif
1474
1475 /*
9f1e87ea
CG
1476 * "Paranoid" exit path from exception stack.
1477 * Paranoid because this is used by NMIs and cannot take
ddeb8f21
AH
1478 * any kernel state for granted.
1479 * We don't do kernel preemption checks here, because only
1480 * NMI should be common and it does not enable IRQs and
1481 * cannot get reschedule ticks.
1482 *
1483 * "trace" is 0 for the NMI handler only, because irq-tracing
1484 * is fundamentally NMI-unsafe. (we cannot change the soft and
1485 * hard flags at once, atomically)
1486 */
1487
1488 /* ebx: no swapgs flag */
1489ENTRY(paranoid_exit)
1f130a78 1490 DEFAULT_FRAME
ddeb8f21 1491 DISABLE_INTERRUPTS(CLBR_NONE)
5963e317 1492 TRACE_IRQS_OFF_DEBUG
ddeb8f21
AH
1493 testl %ebx,%ebx /* swapgs needed? */
1494 jnz paranoid_restore
1495 testl $3,CS(%rsp)
1496 jnz paranoid_userspace
1497paranoid_swapgs:
1498 TRACE_IRQS_IRETQ 0
1499 SWAPGS_UNSAFE_STACK
0300e7f1
SR
1500 RESTORE_ALL 8
1501 jmp irq_return
ddeb8f21 1502paranoid_restore:
5963e317 1503 TRACE_IRQS_IRETQ_DEBUG 0
ddeb8f21
AH
1504 RESTORE_ALL 8
1505 jmp irq_return
1506paranoid_userspace:
1507 GET_THREAD_INFO(%rcx)
1508 movl TI_flags(%rcx),%ebx
1509 andl $_TIF_WORK_MASK,%ebx
1510 jz paranoid_swapgs
1511 movq %rsp,%rdi /* &pt_regs */
1512 call sync_regs
1513 movq %rax,%rsp /* switch stack for scheduling */
1514 testl $_TIF_NEED_RESCHED,%ebx
1515 jnz paranoid_schedule
1516 movl %ebx,%edx /* arg3: thread flags */
1517 TRACE_IRQS_ON
1518 ENABLE_INTERRUPTS(CLBR_NONE)
1519 xorl %esi,%esi /* arg2: oldset */
1520 movq %rsp,%rdi /* arg1: &pt_regs */
1521 call do_notify_resume
1522 DISABLE_INTERRUPTS(CLBR_NONE)
1523 TRACE_IRQS_OFF
1524 jmp paranoid_userspace
1525paranoid_schedule:
1526 TRACE_IRQS_ON
1527 ENABLE_INTERRUPTS(CLBR_ANY)
1528 call schedule
1529 DISABLE_INTERRUPTS(CLBR_ANY)
1530 TRACE_IRQS_OFF
1531 jmp paranoid_userspace
1532 CFI_ENDPROC
1533END(paranoid_exit)
1534
1535/*
1536 * Exception entry point. This expects an error code/orig_rax on the stack.
1537 * Returns the "no swapgs" flag in %ebx.
1538 */
1539ENTRY(error_entry)
1540 XCPT_FRAME
1541 CFI_ADJUST_CFA_OFFSET 15*8
1542 /* oldrax contains error code */
1543 cld
1544 movq_cfi rdi, RDI+8
1545 movq_cfi rsi, RSI+8
1546 movq_cfi rdx, RDX+8
1547 movq_cfi rcx, RCX+8
1548 movq_cfi rax, RAX+8
1549 movq_cfi r8, R8+8
1550 movq_cfi r9, R9+8
1551 movq_cfi r10, R10+8
1552 movq_cfi r11, R11+8
1553 movq_cfi rbx, RBX+8
1554 movq_cfi rbp, RBP+8
1555 movq_cfi r12, R12+8
1556 movq_cfi r13, R13+8
1557 movq_cfi r14, R14+8
1558 movq_cfi r15, R15+8
1559 xorl %ebx,%ebx
1560 testl $3,CS+8(%rsp)
1561 je error_kernelspace
1562error_swapgs:
1563 SWAPGS
1564error_sti:
1565 TRACE_IRQS_OFF
1566 ret
ddeb8f21
AH
1567
1568/*
1569 * There are two places in the kernel that can potentially fault with
1570 * usergs. Handle them here. The exception handlers after iret run with
1571 * kernel gs again, so don't set the user space flag. B-stepping K8s
1572 * sometimes report a truncated RIP for IRET exceptions returning to
1573 * compat mode. Check for these here too.
1574 */
1575error_kernelspace:
1576 incl %ebx
1577 leaq irq_return(%rip),%rcx
1578 cmpq %rcx,RIP+8(%rsp)
1579 je error_swapgs
ae24ffe5
BG
1580 movl %ecx,%eax /* zero extend */
1581 cmpq %rax,RIP+8(%rsp)
1582 je bstep_iret
ddeb8f21 1583 cmpq $gs_change,RIP+8(%rsp)
9f1e87ea 1584 je error_swapgs
ddeb8f21 1585 jmp error_sti
ae24ffe5
BG
1586
1587bstep_iret:
1588 /* Fix truncated RIP */
1589 movq %rcx,RIP+8(%rsp)
97829de5 1590 jmp error_swapgs
e6b04b6b 1591 CFI_ENDPROC
ddeb8f21
AH
1592END(error_entry)
1593
1594
1595/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1596ENTRY(error_exit)
1597 DEFAULT_FRAME
1598 movl %ebx,%eax
1599 RESTORE_REST
1600 DISABLE_INTERRUPTS(CLBR_NONE)
1601 TRACE_IRQS_OFF
1602 GET_THREAD_INFO(%rcx)
1603 testl %eax,%eax
1604 jne retint_kernel
1605 LOCKDEP_SYS_EXIT_IRQ
1606 movl TI_flags(%rcx),%edx
1607 movl $_TIF_WORK_MASK,%edi
1608 andl %edi,%edx
1609 jnz retint_careful
1610 jmp retint_swapgs
1611 CFI_ENDPROC
1612END(error_exit)
1613
3f3c8b8c
SR
1614/*
1615 * Test if a given stack is an NMI stack or not.
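 * \reg must hold the top of the candidate NMI stack (it is clobbered);
 * control goes to \nmi_ret if \stack lies within EXCEPTION_STKSZ bytes at
 * or below that top, otherwise to \normal_ret.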
1616 */
1617 .macro test_in_nmi reg stack nmi_ret normal_ret
1618 cmpq %\reg, \stack
1619 ja \normal_ret
1620 subq $EXCEPTION_STKSZ, %\reg
1621 cmpq %\reg, \stack
1622 jb \normal_ret
1623 jmp \nmi_ret
1624 .endm
ddeb8f21
AH
1625
1626 /* runs on exception stack */
1627ENTRY(nmi)
1628 INTR_FRAME
1629 PARAVIRT_ADJUST_EXCEPTION_FRAME
3f3c8b8c
SR
1630 /*
1631 * We allow breakpoints in NMIs. If a breakpoint occurs, then
1632 * the iretq it performs will take us out of NMI context.
1633 * This means that we can have nested NMIs where the next
1634 * NMI is using the top of the stack of the previous NMI. We
1635 * can't let it execute because the nested NMI will corrupt the
1636 * stack of the previous NMI. NMI handlers are not re-entrant
1637 * anyway.
1638 *
1639 * To handle this case we do the following:
1640 * Check a special location on the stack that contains
1641 * a variable that is set when NMIs are executing.
1642 * The interrupted task's stack is also checked to see if it
1643 * is an NMI stack.
1644 * If the variable is not set and the stack is not the NMI
1645 * stack then:
1646 * o Set the special variable on the stack
1647 * o Copy the interrupt frame into a "saved" location on the stack
1648 * o Copy the interrupt frame into a "copy" location on the stack
1649 * o Continue processing the NMI
1650 * If the variable is set or the previous stack is the NMI stack:
1651 * o Modify the "copy" location to jump to repeat_nmi
1652 * o return back to the first NMI
1653 *
1654 * Now on exit of the first NMI, we first clear the stack variable.
1655 * The NMI stack will tell any nested NMIs at that point that it is
1656 * nested. Then we pop the stack normally with iret, and if there was
1657 * a nested NMI that updated the copy interrupt stack frame, a
1658 * jump will be made to the repeat_nmi code that will handle the second
1659 * NMI.
1660 */
1661
1662 /* Use %rdx as our temp variable throughout */
1663 pushq_cfi %rdx
62610913 1664 CFI_REL_OFFSET rdx, 0
3f3c8b8c 1665
45d5a168
SR
1666 /*
1667 * If %cs was not the kernel segment, then the NMI triggered in user
1668 * space, which means it is definitely not nested.
1669 */
a38449ef 1670 cmpl $__KERNEL_CS, 16(%rsp)
45d5a168
SR
1671 jne first_nmi
1672
3f3c8b8c
SR
1673 /*
1674 * Check the special variable on the stack to see if NMIs are
1675 * executing.
1676 */
a38449ef 1677 cmpl $1, -8(%rsp)
3f3c8b8c
SR
1678 je nested_nmi
1679
1680 /*
1681 * Now test if the previous stack was an NMI stack.
1682 * We need the double check. We check the NMI stack to satisfy the
1683 * race when the first NMI clears the variable before returning.
1684 * We check the variable because the first NMI could be in a
1685 * breakpoint routine using a breakpoint stack.
1686 */
1687 lea 6*8(%rsp), %rdx
1688 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
62610913 1689 CFI_REMEMBER_STATE
3f3c8b8c
SR
1690
1691nested_nmi:
1692 /*
1693 * Do nothing if we interrupted the fixup in repeat_nmi.
1694 * It's about to repeat the NMI handler, so we are fine
1695 * with ignoring this one.
1696 */
1697 movq $repeat_nmi, %rdx
1698 cmpq 8(%rsp), %rdx
1699 ja 1f
1700 movq $end_repeat_nmi, %rdx
1701 cmpq 8(%rsp), %rdx
1702 ja nested_nmi_out
1703
17041:
1705 /* Set up the interrupted NMI's stack to jump to repeat_nmi */
1706 leaq -6*8(%rsp), %rdx
1707 movq %rdx, %rsp
1708 CFI_ADJUST_CFA_OFFSET 6*8
1709 pushq_cfi $__KERNEL_DS
1710 pushq_cfi %rdx
1711 pushfq_cfi
1712 pushq_cfi $__KERNEL_CS
1713 pushq_cfi $repeat_nmi
1714
1715 /* Put stack back */
1716 addq $(11*8), %rsp
1717 CFI_ADJUST_CFA_OFFSET -11*8
1718
1719nested_nmi_out:
1720 popq_cfi %rdx
62610913 1721 CFI_RESTORE rdx
3f3c8b8c
SR
1722
1723 /* No need to check faults here */
1724 INTERRUPT_RETURN
1725
62610913 1726 CFI_RESTORE_STATE
3f3c8b8c
SR
1727first_nmi:
1728 /*
1729 * Because nested NMIs will use the pushed location that we
1730 * stored in rdx, we must keep that space available.
1731 * Here's what our stack frame will look like:
1732 * +-------------------------+
1733 * | original SS |
1734 * | original Return RSP |
1735 * | original RFLAGS |
1736 * | original CS |
1737 * | original RIP |
1738 * +-------------------------+
1739 * | temp storage for rdx |
1740 * +-------------------------+
1741 * | NMI executing variable |
1742 * +-------------------------+
1743 * | Saved SS |
1744 * | Saved Return RSP |
1745 * | Saved RFLAGS |
1746 * | Saved CS |
1747 * | Saved RIP |
1748 * +-------------------------+
1749 * | copied SS |
1750 * | copied Return RSP |
1751 * | copied RFLAGS |
1752 * | copied CS |
1753 * | copied RIP |
1754 * +-------------------------+
1755 * | pt_regs |
1756 * +-------------------------+
1757 *
79fb4ad6
SR
1758 * The saved stack frame is used to fix up the copied stack frame
1759 * that a nested NMI may change to make the interrupted NMI iret jump
1760 * to the repeat_nmi. The original stack frame and the temp storage
3f3c8b8c
SR
1761 * are also used by nested NMIs and cannot be trusted on exit.
1762 */
79fb4ad6 1763 /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
62610913
JB
1764 movq (%rsp), %rdx
1765 CFI_RESTORE rdx
1766
3f3c8b8c
SR
1767 /* Set the NMI executing variable on the stack. */
1768 pushq_cfi $1
1769
1770 /* Copy the stack frame to the Saved frame */
1771 .rept 5
1772 pushq_cfi 6*8(%rsp)
1773 .endr
62610913
JB
1774 CFI_DEF_CFA_OFFSET SS+8-RIP
1775
79fb4ad6
SR
1776 /* Everything up to here is safe from nested NMIs */
1777
62610913
JB
1778 /*
1779 * If there was a nested NMI, the first NMI's iret will return
1780 * here. But NMIs are still enabled and we can take another
1781 * nested NMI. The nested NMI checks the interrupted RIP to see
1782 * if it is between repeat_nmi and end_repeat_nmi, and if so
1783 * it will just return, as we are about to repeat an NMI anyway.
1784 * This makes it safe to copy to the stack frame that a nested
1785 * NMI will update.
1786 */
1787repeat_nmi:
1788 /*
1789 * Update the stack variable to say we are still in NMI (the update
1790 * is benign for the non-repeat case, where 1 was pushed just above
1791 * to this very stack slot).
1792 */
1793 movq $1, 5*8(%rsp)
3f3c8b8c
SR
1794
1795 /* Make another copy, this one may be modified by nested NMIs */
1796 .rept 5
1797 pushq_cfi 4*8(%rsp)
1798 .endr
62610913
JB
1799 CFI_DEF_CFA_OFFSET SS+8-RIP
1800end_repeat_nmi:
3f3c8b8c
SR
1801
1802 /*
1803 * Everything below this point can be preempted by a nested
79fb4ad6
SR
1804 * NMI if the first NMI took an exception and reset our iret stack
1805 * so that we repeat another NMI.
3f3c8b8c 1806 */
1fd466ef 1807 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
b1cccb1b
JB
1808 subq $ORIG_RAX-R15, %rsp
1809 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1fd466ef
SR
1810 /*
1811 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
1812 * as we should not be calling schedule in NMI context,
1813 * even with normal interrupts enabled. An NMI should not be
1814 * setting NEED_RESCHED or anything that normal interrupts and
1815 * exceptions might do.
1816 */
ddeb8f21
AH
1817 call save_paranoid
1818 DEFAULT_FRAME 0
7fbb98c5
SR
1819
1820 /*
1821 * Save off the CR2 register. If we take a page fault in the NMI then
1822 * it could corrupt the CR2 value. If the NMI preempts a page fault
1823 * handler before it was able to read the CR2 register, and then the
1824 * NMI itself takes a page fault, the page fault that was preempted
1825 * will read the information from the NMI page fault and not the
1826 * original fault. Save it off and restore it if it changes.
1827 * Use the r12 callee-saved register.
1828 */
1829 movq %cr2, %r12
1830
ddeb8f21
AH
1831 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1832 movq %rsp,%rdi
1833 movq $-1,%rsi
1834 call do_nmi
7fbb98c5
SR
1835
1836 /* Did the NMI take a page fault? Restore cr2 if it did */
1837 movq %cr2, %rcx
1838 cmpq %rcx, %r12
1839 je 1f
1840 movq %r12, %cr2
18411:
1842
ddeb8f21
AH
1843 testl %ebx,%ebx /* swapgs needed? */
1844 jnz nmi_restore
ddeb8f21
AH
1845nmi_swapgs:
1846 SWAPGS_UNSAFE_STACK
1847nmi_restore:
1848 RESTORE_ALL 8
3f3c8b8c
SR
1849 /* Clear the NMI executing stack variable */
1850 movq $0, 10*8(%rsp)
ddeb8f21 1851 jmp irq_return
9f1e87ea 1852 CFI_ENDPROC
ddeb8f21
AH
1853END(nmi)
1854
1855ENTRY(ignore_sysret)
1856 CFI_STARTPROC
1857 mov $-ENOSYS,%eax
1858 sysret
1859 CFI_ENDPROC
1860END(ignore_sysret)
1861
1862/*
1863 * End of kprobes section
1864 */
1865 .popsection