1 /*
2 * linux/arch/x86_64/entry.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
7 */
8
9 /*
10 * entry.S contains the system-call and fault low-level handling routines.
11 *
12 * Some of this is documented in Documentation/x86/entry_64.txt
13 *
14  * NOTE: This code handles signal recognition, which happens after every
15  * interrupt and after each system call.
16  *
17  * Normal syscalls and interrupts don't save a full stack frame; that is
18  * only done for syscall tracing, signals, or fork/exec et al.
19 *
20 * A note on terminology:
21 * - top of stack: Architecture defined interrupt frame from SS to RIP
22 * at the top of the kernel process stack.
23 * - partial stack frame: partially saved registers up to R11.
24  * - full stack frame: Like partial stack frame, but all registers saved.
25 *
26 * Some macro usage:
27 * - CFI macros are used to generate dwarf2 unwind information for better
28 * backtraces. They don't change any code.
29 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
30 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
31  *   There are unfortunately lots of special cases where some registers are
32  *   not touched. The macro is a big mess that should be cleaned up.
33 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
34 * Gives a full stack frame.
35  * - ENTRY/END - Define functions in the symbol table.
36 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
37 * frame that is otherwise undefined after a SYSCALL
38 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
39 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
40 */
41
42 #include <linux/linkage.h>
43 #include <asm/segment.h>
44 #include <asm/cache.h>
45 #include <asm/errno.h>
46 #include <asm/dwarf2.h>
47 #include <asm/calling.h>
48 #include <asm/asm-offsets.h>
49 #include <asm/msr.h>
50 #include <asm/unistd.h>
51 #include <asm/thread_info.h>
52 #include <asm/hw_irq.h>
53 #include <asm/page_types.h>
54 #include <asm/irqflags.h>
55 #include <asm/paravirt.h>
56 #include <asm/ftrace.h>
57 #include <asm/percpu.h>
58 #include <linux/err.h>
59
60 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
61 #include <linux/elf-em.h>
62 #define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
63 #define __AUDIT_ARCH_64BIT 0x80000000
64 #define __AUDIT_ARCH_LE 0x40000000
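/* For reference: EM_X86_64 is 62 (0x3e), so AUDIT_ARCH_X86_64 evaluates to 0xc000003e. */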
65
66 .code64
67 .section .entry.text, "ax"
68
69 #ifdef CONFIG_FUNCTION_TRACER
70 #ifdef CONFIG_DYNAMIC_FTRACE
71 ENTRY(mcount)
72 retq
73 END(mcount)
74
75 ENTRY(ftrace_caller)
76 cmpl $0, function_trace_stop
77 jne ftrace_stub
78
79 MCOUNT_SAVE_FRAME
80
81 movq 0x38(%rsp), %rdi
82 movq 8(%rbp), %rsi
83 subq $MCOUNT_INSN_SIZE, %rdi
84
85 GLOBAL(ftrace_call)
86 call ftrace_stub
87
88 MCOUNT_RESTORE_FRAME
89
90 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
91 GLOBAL(ftrace_graph_call)
92 jmp ftrace_stub
93 #endif
94
95 GLOBAL(ftrace_stub)
96 retq
97 END(ftrace_caller)
98
99 #else /* ! CONFIG_DYNAMIC_FTRACE */
100 ENTRY(mcount)
101 cmpl $0, function_trace_stop
102 jne ftrace_stub
103
104 cmpq $ftrace_stub, ftrace_trace_function
105 jnz trace
106
107 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
108 cmpq $ftrace_stub, ftrace_graph_return
109 jnz ftrace_graph_caller
110
111 cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
112 jnz ftrace_graph_caller
113 #endif
114
115 GLOBAL(ftrace_stub)
116 retq
117
118 trace:
119 MCOUNT_SAVE_FRAME
120
121 movq 0x38(%rsp), %rdi
122 movq 8(%rbp), %rsi
123 subq $MCOUNT_INSN_SIZE, %rdi
124
125 call *ftrace_trace_function
126
127 MCOUNT_RESTORE_FRAME
128
129 jmp ftrace_stub
130 END(mcount)
131 #endif /* CONFIG_DYNAMIC_FTRACE */
132 #endif /* CONFIG_FUNCTION_TRACER */
133
134 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
135 ENTRY(ftrace_graph_caller)
136 cmpl $0, function_trace_stop
137 jne ftrace_stub
138
139 MCOUNT_SAVE_FRAME
140
141 leaq 8(%rbp), %rdi
142 movq 0x38(%rsp), %rsi
143 movq (%rbp), %rdx
144 subq $MCOUNT_INSN_SIZE, %rsi
145
146 call prepare_ftrace_return
147
148 MCOUNT_RESTORE_FRAME
149
150 retq
151 END(ftrace_graph_caller)
152
153 GLOBAL(return_to_handler)
154 subq $24, %rsp
155
156 /* Save the return values */
157 movq %rax, (%rsp)
158 movq %rdx, 8(%rsp)
159 movq %rbp, %rdi
160
161 call ftrace_return_to_handler
162
163 movq %rax, %rdi
164 movq 8(%rsp), %rdx
165 movq (%rsp), %rax
166 addq $24, %rsp
167 jmp *%rdi
168 #endif
169
170
171 #ifndef CONFIG_PREEMPT
172 #define retint_kernel retint_restore_args
173 #endif
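/*
 * Without CONFIG_PREEMPT a return to kernel space never needs to
 * reschedule, so retint_kernel above simply aliases the
 * register-restore path defined further down.
 */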
174
175 #ifdef CONFIG_PARAVIRT
176 ENTRY(native_usergs_sysret64)
177 swapgs
178 sysretq
179 ENDPROC(native_usergs_sysret64)
180 #endif /* CONFIG_PARAVIRT */
181
182
183 .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
184 #ifdef CONFIG_TRACE_IRQFLAGS
185 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
186 jnc 1f
187 TRACE_IRQS_ON
188 1:
189 #endif
190 .endm
191
192 /*
193  * C code is not supposed to know about the undefined top of stack. Every time
194  * a C function with a pt_regs argument is called from the SYSCALL-based
195  * fast path, FIXUP_TOP_OF_STACK is needed.
196 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
197 * manipulation.
198 */
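/*
 * Background: SYSCALL itself leaves the return RIP in %rcx and the saved
 * RFLAGS in %r11 (see the register setup comment at system_call below),
 * which is why FIXUP_TOP_OF_STACK rebuilds the EFLAGS slot from the R11
 * slot and poisons the RCX slot with -1.
 */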
199
200 /* %rsp:at FRAMEEND */
201 .macro FIXUP_TOP_OF_STACK tmp offset=0
202 movq PER_CPU_VAR(old_rsp),\tmp
203 movq \tmp,RSP+\offset(%rsp)
204 movq $__USER_DS,SS+\offset(%rsp)
205 movq $__USER_CS,CS+\offset(%rsp)
206 movq $-1,RCX+\offset(%rsp)
207 movq R11+\offset(%rsp),\tmp /* get eflags */
208 movq \tmp,EFLAGS+\offset(%rsp)
209 .endm
210
211 .macro RESTORE_TOP_OF_STACK tmp offset=0
212 movq RSP+\offset(%rsp),\tmp
213 movq \tmp,PER_CPU_VAR(old_rsp)
214 movq EFLAGS+\offset(%rsp),\tmp
215 movq \tmp,R11+\offset(%rsp)
216 .endm
217
218 .macro FAKE_STACK_FRAME child_rip
219 /* push in order ss, rsp, eflags, cs, rip */
220 xorl %eax, %eax
221 pushq_cfi $__KERNEL_DS /* ss */
222 /*CFI_REL_OFFSET ss,0*/
223 pushq_cfi %rax /* rsp */
224 CFI_REL_OFFSET rsp,0
225 pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
226 /*CFI_REL_OFFSET rflags,0*/
227 pushq_cfi $__KERNEL_CS /* cs */
228 /*CFI_REL_OFFSET cs,0*/
229 pushq_cfi \child_rip /* rip */
230 CFI_REL_OFFSET rip,0
231 pushq_cfi %rax /* orig rax */
232 .endm
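/*
 * FAKE_STACK_FRAME above pushes six quadwords in total (ss, rsp, eflags,
 * cs, rip and orig_rax); UNFAKE_STACK_FRAME below pops exactly those
 * 6*8 bytes again.
 */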
233
234 .macro UNFAKE_STACK_FRAME
235 addq $8*6, %rsp
236 CFI_ADJUST_CFA_OFFSET -(6*8)
237 .endm
238
239 /*
240 * initial frame state for interrupts (and exceptions without error code)
241 */
242 .macro EMPTY_FRAME start=1 offset=0
243 .if \start
244 CFI_STARTPROC simple
245 CFI_SIGNAL_FRAME
246 CFI_DEF_CFA rsp,8+\offset
247 .else
248 CFI_DEF_CFA_OFFSET 8+\offset
249 .endif
250 .endm
251
252 /*
253 * initial frame state for interrupts (and exceptions without error code)
254 */
255 .macro INTR_FRAME start=1 offset=0
256 EMPTY_FRAME \start, SS+8+\offset-RIP
257 /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
258 CFI_REL_OFFSET rsp, RSP+\offset-RIP
259 /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
260 /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
261 CFI_REL_OFFSET rip, RIP+\offset-RIP
262 .endm
263
264 /*
265 * initial frame state for exceptions with error code (and interrupts
266 * with vector already pushed)
267 */
268 .macro XCPT_FRAME start=1 offset=0
269 INTR_FRAME \start, RIP+\offset-ORIG_RAX
270 /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
271 .endm
272
273 /*
274 * frame that enables calling into C.
275 */
276 .macro PARTIAL_FRAME start=1 offset=0
277 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
278 CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
279 CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
280 CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
281 CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
282 CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
283 CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
284 CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
285 CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
286 CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
287 .endm
288
289 /*
290 * frame that enables passing a complete pt_regs to a C function.
291 */
292 .macro DEFAULT_FRAME start=1 offset=0
293 PARTIAL_FRAME \start, R11+\offset-R15
294 CFI_REL_OFFSET rbx, RBX+\offset
295 CFI_REL_OFFSET rbp, RBP+\offset
296 CFI_REL_OFFSET r12, R12+\offset
297 CFI_REL_OFFSET r13, R13+\offset
298 CFI_REL_OFFSET r14, R14+\offset
299 CFI_REL_OFFSET r15, R15+\offset
300 .endm
301
302 /* save partial stack frame */
303 .macro SAVE_ARGS_IRQ
304 cld
305 /* start from rbp in pt_regs and jump over */
306 movq_cfi rdi, RDI-RBP
307 movq_cfi rsi, RSI-RBP
308 movq_cfi rdx, RDX-RBP
309 movq_cfi rcx, RCX-RBP
310 movq_cfi rax, RAX-RBP
311 movq_cfi r8, R8-RBP
312 movq_cfi r9, R9-RBP
313 movq_cfi r10, R10-RBP
314 movq_cfi r11, R11-RBP
315
316 /* Save rbp so that we can unwind from get_irq_regs() */
317 movq_cfi rbp, 0
318
319 /* Save previous stack value */
320 movq %rsp, %rsi
321
322 leaq -RBP(%rsp),%rdi /* arg1 for handler */
323 testl $3, CS-RBP(%rsi)
324 je 1f
325 SWAPGS
326 /*
327 * irq_count is used to check if a CPU is already on an interrupt stack
328  * or not. While this is essentially redundant with preempt_count, it is
329 * a little cheaper to use a separate counter in the PDA (short of
330 * moving irq_enter into assembly, which would be too much work)
331 */
332 1: incl PER_CPU_VAR(irq_count)
333 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
334 CFI_DEF_CFA_REGISTER rsi
335
336 /* Store previous stack value */
337 pushq %rsi
338 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
339 0x77 /* DW_OP_breg7 */, 0, \
340 0x06 /* DW_OP_deref */, \
341 0x08 /* DW_OP_const1u */, SS+8-RBP, \
342 0x22 /* DW_OP_plus */
343 /* We entered an interrupt context - irqs are off: */
344 TRACE_IRQS_OFF
345 .endm
346
347 ENTRY(save_rest)
348 PARTIAL_FRAME 1 REST_SKIP+8
349 movq 5*8+16(%rsp), %r11 /* save return address */
350 movq_cfi rbx, RBX+16
351 movq_cfi rbp, RBP+16
352 movq_cfi r12, R12+16
353 movq_cfi r13, R13+16
354 movq_cfi r14, R14+16
355 movq_cfi r15, R15+16
356 movq %r11, 8(%rsp) /* return address */
357 FIXUP_TOP_OF_STACK %r11, 16
358 ret
359 CFI_ENDPROC
360 END(save_rest)
361
362 /* save complete stack frame */
363 .pushsection .kprobes.text, "ax"
364 ENTRY(save_paranoid)
365 XCPT_FRAME 1 RDI+8
366 cld
367 movq_cfi rdi, RDI+8
368 movq_cfi rsi, RSI+8
369 movq_cfi rdx, RDX+8
370 movq_cfi rcx, RCX+8
371 movq_cfi rax, RAX+8
372 movq_cfi r8, R8+8
373 movq_cfi r9, R9+8
374 movq_cfi r10, R10+8
375 movq_cfi r11, R11+8
376 movq_cfi rbx, RBX+8
377 movq_cfi rbp, RBP+8
378 movq_cfi r12, R12+8
379 movq_cfi r13, R13+8
380 movq_cfi r14, R14+8
381 movq_cfi r15, R15+8
382 movl $1,%ebx
383 movl $MSR_GS_BASE,%ecx
384 rdmsr
385 testl %edx,%edx
386 js 1f /* negative -> in kernel */
387 SWAPGS
388 xorl %ebx,%ebx
389 1: ret
390 CFI_ENDPROC
391 END(save_paranoid)
392 .popsection
393
394 /*
395 * A newly forked process directly context switches into this address.
396 *
397 * rdi: prev task we switched from
398 */
399 ENTRY(ret_from_fork)
400 DEFAULT_FRAME
401
402 LOCK ; btr $TIF_FORK,TI_flags(%r8)
403
404 pushq_cfi kernel_eflags(%rip)
405 popfq_cfi # reset kernel eflags
406
407 call schedule_tail # rdi: 'prev' task parameter
408
409 GET_THREAD_INFO(%rcx)
410
411 RESTORE_REST
412
413 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
414 jz retint_restore_args
415
416 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
417 jnz int_ret_from_sys_call
418
419 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
420 jmp ret_from_sys_call # go to the SYSRET fastpath
421
422 CFI_ENDPROC
423 END(ret_from_fork)
424
425 /*
426 * System call entry. Up to 6 arguments in registers are supported.
427 *
428 * SYSCALL does not save anything on the stack and does not change the
429 * stack pointer.
430 */
431
432 /*
433 * Register setup:
434 * rax system call number
435 * rdi arg0
436 * rcx return address for syscall/sysret, C arg3
437 * rsi arg1
438 * rdx arg2
439 * r10 arg3 (--> moved to rcx for C)
440 * r8 arg4
441 * r9 arg5
442 * r11 eflags for syscall/sysret, temporary for C
443 * r12-r15,rbp,rbx saved by C code, not touched.
444 *
445 * Interrupts are off on entry.
446 * Only called from user space.
447 *
448  * XXX	if we had a free scratch register we could save the RSP into the stack frame
449  *	and report it properly in ps. Unfortunately we don't have one.
450  *
451  * When the user can change the frame, always force IRET. That is because
452  * it deals with non-canonical addresses better. SYSRET has trouble
453  * with them due to bugs in both AMD and Intel CPUs.
454 */
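/*
 * Illustration only (user-space side, not part of this file): a raw
 * write(1, buf, len) system call reaches system_call below roughly as
 *
 *	movl	$1, %eax		# __NR_write
 *	movl	$1, %edi		# arg0: fd
 *	leaq	buf(%rip), %rsi		# arg1: buf (illustrative symbol)
 *	movl	$len, %edx		# arg2: count (illustrative)
 *	syscall				# CPU puts return RIP in %rcx,
 *					# RFLAGS in %r11
 */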
455
456 ENTRY(system_call)
457 CFI_STARTPROC simple
458 CFI_SIGNAL_FRAME
459 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
460 CFI_REGISTER rip,rcx
461 /*CFI_REGISTER rflags,r11*/
462 SWAPGS_UNSAFE_STACK
463 /*
464 * A hypervisor implementation might want to use a label
465 * after the swapgs, so that it can do the swapgs
466 * for the guest and jump here on syscall.
467 */
468 GLOBAL(system_call_after_swapgs)
469
470 movq %rsp,PER_CPU_VAR(old_rsp)
471 movq PER_CPU_VAR(kernel_stack),%rsp
472 /*
473 * No need to follow this irqs off/on section - it's straight
474 * and short:
475 */
476 ENABLE_INTERRUPTS(CLBR_NONE)
477 SAVE_ARGS 8,0
478 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
479 movq %rcx,RIP-ARGOFFSET(%rsp)
480 CFI_REL_OFFSET rip,RIP-ARGOFFSET
481 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
482 jnz tracesys
483 system_call_fastpath:
484 cmpq $__NR_syscall_max,%rax
485 ja badsys
486 movq %r10,%rcx
487 call *sys_call_table(,%rax,8) # XXX: rip relative
488 movq %rax,RAX-ARGOFFSET(%rsp)
489 /*
490 * Syscall return path ending with SYSRET (fast path)
491 * Has incomplete stack frame and undefined top of stack.
492 */
493 ret_from_sys_call:
494 movl $_TIF_ALLWORK_MASK,%edi
495 /* edi: flagmask */
496 sysret_check:
497 LOCKDEP_SYS_EXIT
498 DISABLE_INTERRUPTS(CLBR_NONE)
499 TRACE_IRQS_OFF
500 movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
501 andl %edi,%edx
502 jnz sysret_careful
503 CFI_REMEMBER_STATE
504 /*
505 * sysretq will re-enable interrupts:
506 */
507 TRACE_IRQS_ON
508 movq RIP-ARGOFFSET(%rsp),%rcx
509 CFI_REGISTER rip,rcx
510 RESTORE_ARGS 1,-ARG_SKIP,0
511 /*CFI_REGISTER rflags,r11*/
512 movq PER_CPU_VAR(old_rsp), %rsp
513 USERGS_SYSRET64
514
515 CFI_RESTORE_STATE
516 /* Handle reschedules */
517 /* edx: work, edi: workmask */
518 sysret_careful:
519 bt $TIF_NEED_RESCHED,%edx
520 jnc sysret_signal
521 TRACE_IRQS_ON
522 ENABLE_INTERRUPTS(CLBR_NONE)
523 pushq_cfi %rdi
524 call schedule
525 popq_cfi %rdi
526 jmp sysret_check
527
528 /* Handle a signal */
529 sysret_signal:
530 TRACE_IRQS_ON
531 ENABLE_INTERRUPTS(CLBR_NONE)
532 #ifdef CONFIG_AUDITSYSCALL
533 bt $TIF_SYSCALL_AUDIT,%edx
534 jc sysret_audit
535 #endif
536 /*
537 * We have a signal, or exit tracing or single-step.
538 * These all wind up with the iret return path anyway,
539 * so just join that path right now.
540 */
541 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
542 jmp int_check_syscall_exit_work
543
544 badsys:
545 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
546 jmp ret_from_sys_call
547
548 #ifdef CONFIG_AUDITSYSCALL
549 /*
550 * Fast path for syscall audit without full syscall trace.
551 * We just call __audit_syscall_entry() directly, and then
552 * jump back to the normal fast path.
553 */
554 auditsys:
555 movq %r10,%r9 /* 6th arg: 4th syscall arg */
556 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
557 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
558 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
559 movq %rax,%rsi /* 2nd arg: syscall number */
560 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
561 call __audit_syscall_entry
562 LOAD_ARGS 0 /* reload call-clobbered registers */
563 jmp system_call_fastpath
564
565 /*
566 * Return fast path for syscall audit. Call __audit_syscall_exit()
567 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
568 * masked off.
569 */
570 sysret_audit:
571 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
572 cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
573 setbe %al /* 1 if so, 0 if not */
574 movzbl %al,%edi /* zero-extend that into %edi */
575 call __audit_syscall_exit
576 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
577 jmp sysret_check
578 #endif /* CONFIG_AUDITSYSCALL */
579
580 /* Do syscall tracing */
581 tracesys:
582 #ifdef CONFIG_AUDITSYSCALL
583 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
584 jz auditsys
585 #endif
586 SAVE_REST
587 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
588 FIXUP_TOP_OF_STACK %rdi
589 movq %rsp,%rdi
590 call syscall_trace_enter
591 /*
592 * Reload arg registers from stack in case ptrace changed them.
593 * We don't reload %rax because syscall_trace_enter() returned
594 * the value it wants us to use in the table lookup.
595 */
596 LOAD_ARGS ARGOFFSET, 1
597 RESTORE_REST
598 cmpq $__NR_syscall_max,%rax
599 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
600 movq %r10,%rcx /* fixup for C */
601 call *sys_call_table(,%rax,8)
602 movq %rax,RAX-ARGOFFSET(%rsp)
603 /* Use IRET because user could have changed frame */
604
605 /*
606 * Syscall return path ending with IRET.
607 * Has correct top of stack, but partial stack frame.
608 */
609 GLOBAL(int_ret_from_sys_call)
610 DISABLE_INTERRUPTS(CLBR_NONE)
611 TRACE_IRQS_OFF
612 movl $_TIF_ALLWORK_MASK,%edi
613 /* edi: mask to check */
614 GLOBAL(int_with_check)
615 LOCKDEP_SYS_EXIT_IRQ
616 GET_THREAD_INFO(%rcx)
617 movl TI_flags(%rcx),%edx
618 andl %edi,%edx
619 jnz int_careful
620 andl $~TS_COMPAT,TI_status(%rcx)
621 jmp retint_swapgs
622
623 /* Either reschedule or signal or syscall exit tracking needed. */
624 /* First do a reschedule test. */
625 /* edx: work, edi: workmask */
626 int_careful:
627 bt $TIF_NEED_RESCHED,%edx
628 jnc int_very_careful
629 TRACE_IRQS_ON
630 ENABLE_INTERRUPTS(CLBR_NONE)
631 pushq_cfi %rdi
632 call schedule
633 popq_cfi %rdi
634 DISABLE_INTERRUPTS(CLBR_NONE)
635 TRACE_IRQS_OFF
636 jmp int_with_check
637
638 /* handle signals and tracing -- both require a full stack frame */
639 int_very_careful:
640 TRACE_IRQS_ON
641 ENABLE_INTERRUPTS(CLBR_NONE)
642 int_check_syscall_exit_work:
643 SAVE_REST
644 /* Check for syscall exit trace */
645 testl $_TIF_WORK_SYSCALL_EXIT,%edx
646 jz int_signal
647 pushq_cfi %rdi
648 leaq 8(%rsp),%rdi # &ptregs -> arg1
649 call syscall_trace_leave
650 popq_cfi %rdi
651 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
652 jmp int_restore_rest
653
654 int_signal:
655 testl $_TIF_DO_NOTIFY_MASK,%edx
656 jz 1f
657 movq %rsp,%rdi # &ptregs -> arg1
658 xorl %esi,%esi # oldset -> arg2
659 call do_notify_resume
660 1: movl $_TIF_WORK_MASK,%edi
661 int_restore_rest:
662 RESTORE_REST
663 DISABLE_INTERRUPTS(CLBR_NONE)
664 TRACE_IRQS_OFF
665 jmp int_with_check
666 CFI_ENDPROC
667 END(system_call)
668
669 /*
670  * Certain special system calls need to save a full stack frame.
671 */
672 .macro PTREGSCALL label,func,arg
673 ENTRY(\label)
674 PARTIAL_FRAME 1 8 /* offset 8: return address */
675 subq $REST_SKIP, %rsp
676 CFI_ADJUST_CFA_OFFSET REST_SKIP
677 call save_rest
678 DEFAULT_FRAME 0 8 /* offset 8: return address */
679 leaq 8(%rsp), \arg /* pt_regs pointer */
680 call \func
681 jmp ptregscall_common
682 CFI_ENDPROC
683 END(\label)
684 .endm
685
686 PTREGSCALL stub_clone, sys_clone, %r8
687 PTREGSCALL stub_fork, sys_fork, %rdi
688 PTREGSCALL stub_vfork, sys_vfork, %rdi
689 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
690 PTREGSCALL stub_iopl, sys_iopl, %rsi
691
692 ENTRY(ptregscall_common)
693 DEFAULT_FRAME 1 8 /* offset 8: return address */
694 RESTORE_TOP_OF_STACK %r11, 8
695 movq_cfi_restore R15+8, r15
696 movq_cfi_restore R14+8, r14
697 movq_cfi_restore R13+8, r13
698 movq_cfi_restore R12+8, r12
699 movq_cfi_restore RBP+8, rbp
700 movq_cfi_restore RBX+8, rbx
701 ret $REST_SKIP /* pop extended registers */
702 CFI_ENDPROC
703 END(ptregscall_common)
704
705 ENTRY(stub_execve)
706 CFI_STARTPROC
707 addq $8, %rsp
708 PARTIAL_FRAME 0
709 SAVE_REST
710 FIXUP_TOP_OF_STACK %r11
711 movq %rsp, %rcx
712 call sys_execve
713 RESTORE_TOP_OF_STACK %r11
714 movq %rax,RAX(%rsp)
715 RESTORE_REST
716 jmp int_ret_from_sys_call
717 CFI_ENDPROC
718 END(stub_execve)
719
720 /*
721 * sigreturn is special because it needs to restore all registers on return.
722 * This cannot be done with SYSRET, so use the IRET return path instead.
723 */
724 ENTRY(stub_rt_sigreturn)
725 CFI_STARTPROC
726 addq $8, %rsp
727 PARTIAL_FRAME 0
728 SAVE_REST
729 movq %rsp,%rdi
730 FIXUP_TOP_OF_STACK %r11
731 call sys_rt_sigreturn
732 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
733 RESTORE_REST
734 jmp int_ret_from_sys_call
735 CFI_ENDPROC
736 END(stub_rt_sigreturn)
737
738 /*
739 * Build the entry stubs and pointer table with some assembler magic.
740 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
741 * single cache line on all modern x86 implementations.
742 */
743 .section .init.rodata,"a"
744 ENTRY(interrupt)
745 .section .entry.text
746 .p2align 5
747 .p2align CONFIG_X86_L1_CACHE_SHIFT
748 ENTRY(irq_entries_start)
749 INTR_FRAME
750 vector=FIRST_EXTERNAL_VECTOR
751 .rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
752 .balign 32
753 .rept 7
754 .if vector < NR_VECTORS
755 .if vector <> FIRST_EXTERNAL_VECTOR
756 CFI_ADJUST_CFA_OFFSET -8
757 .endif
758 1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
759 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
760 jmp 2f
761 .endif
762 .previous
763 .quad 1b
764 .section .entry.text
765 vector=vector+1
766 .endif
767 .endr
768 2: jmp common_interrupt
769 .endr
770 CFI_ENDPROC
771 END(irq_entries_start)
772
773 .previous
774 END(interrupt)
775 .previous
776
777 /*
778 * Interrupt entry/exit.
779 *
780  * Interrupt entry points save only callee-clobbered registers in the fast path.
781 *
782 * Entry runs with interrupts off.
783 */
784
785 /* 0(%rsp): ~(interrupt number) */
786 .macro interrupt func
787 /* reserve pt_regs for scratch regs and rbp */
788 subq $ORIG_RAX-RBP, %rsp
789 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
790 SAVE_ARGS_IRQ
791 call \func
792 .endm
793
794 /*
795 * Interrupt entry/exit should be protected against kprobes
796 */
797 .pushsection .kprobes.text, "ax"
798 /*
799 * The interrupt stubs push (~vector+0x80) onto the stack and
800 * then jump to common_interrupt.
801 */
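/*
 * Worked example (for illustration): the stub for vector 0x31 pushes
 * $(~0x31 + 0x80) = 0x4e, which fits in a signed byte and keeps the push
 * encoding short.  The addq $-0x80 below turns that back into
 * ~0x31 = -0x32, i.e. the one's complement of the vector in the
 * [-256,-1] range, from which the handler recovers the vector.
 */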
802 .p2align CONFIG_X86_L1_CACHE_SHIFT
803 common_interrupt:
804 XCPT_FRAME
805 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
806 interrupt do_IRQ
807 /* 0(%rsp): old_rsp-ARGOFFSET */
808 ret_from_intr:
809 DISABLE_INTERRUPTS(CLBR_NONE)
810 TRACE_IRQS_OFF
811 decl PER_CPU_VAR(irq_count)
812
813 /* Restore saved previous stack */
814 popq %rsi
815 CFI_DEF_CFA_REGISTER rsi
816 leaq ARGOFFSET-RBP(%rsi), %rsp
817 CFI_DEF_CFA_REGISTER rsp
818 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
819
820 exit_intr:
821 GET_THREAD_INFO(%rcx)
822 testl $3,CS-ARGOFFSET(%rsp)
823 je retint_kernel
824
825 /* Interrupt came from user space */
826 /*
827 * Has a correct top of stack, but a partial stack frame
828 * %rcx: thread info. Interrupts off.
829 */
830 retint_with_reschedule:
831 movl $_TIF_WORK_MASK,%edi
832 retint_check:
833 LOCKDEP_SYS_EXIT_IRQ
834 movl TI_flags(%rcx),%edx
835 andl %edi,%edx
836 CFI_REMEMBER_STATE
837 jnz retint_careful
838
839 retint_swapgs: /* return to user-space */
840 /*
841 * The iretq could re-enable interrupts:
842 */
843 DISABLE_INTERRUPTS(CLBR_ANY)
844 TRACE_IRQS_IRETQ
845 SWAPGS
846 jmp restore_args
847
848 retint_restore_args: /* return to kernel space */
849 DISABLE_INTERRUPTS(CLBR_ANY)
850 /*
851 * The iretq could re-enable interrupts:
852 */
853 TRACE_IRQS_IRETQ
854 restore_args:
855 RESTORE_ARGS 1,8,1
856
857 irq_return:
858 INTERRUPT_RETURN
859
860 .section __ex_table, "a"
861 .quad irq_return, bad_iret
862 .previous
863
864 #ifdef CONFIG_PARAVIRT
865 ENTRY(native_iret)
866 iretq
867
868 .section __ex_table,"a"
869 .quad native_iret, bad_iret
870 .previous
871 #endif
872
873 .section .fixup,"ax"
874 bad_iret:
875 /*
876 * The iret traps when the %cs or %ss being restored is bogus.
877 * We've lost the original trap vector and error code.
878 * #GPF is the most likely one to get for an invalid selector.
879 * So pretend we completed the iret and took the #GPF in user mode.
880 *
881 * We are now running with the kernel GS after exception recovery.
882 * But error_entry expects us to have user GS to match the user %cs,
883 * so swap back.
884 */
885 pushq $0
886
887 SWAPGS
888 jmp general_protection
889
890 .previous
891
892 /* edi: workmask, edx: work */
893 retint_careful:
894 CFI_RESTORE_STATE
895 bt $TIF_NEED_RESCHED,%edx
896 jnc retint_signal
897 TRACE_IRQS_ON
898 ENABLE_INTERRUPTS(CLBR_NONE)
899 pushq_cfi %rdi
900 call schedule
901 popq_cfi %rdi
902 GET_THREAD_INFO(%rcx)
903 DISABLE_INTERRUPTS(CLBR_NONE)
904 TRACE_IRQS_OFF
905 jmp retint_check
906
907 retint_signal:
908 testl $_TIF_DO_NOTIFY_MASK,%edx
909 jz retint_swapgs
910 TRACE_IRQS_ON
911 ENABLE_INTERRUPTS(CLBR_NONE)
912 SAVE_REST
913 movq $-1,ORIG_RAX(%rsp)
914 xorl %esi,%esi # oldset
915 movq %rsp,%rdi # &pt_regs
916 call do_notify_resume
917 RESTORE_REST
918 DISABLE_INTERRUPTS(CLBR_NONE)
919 TRACE_IRQS_OFF
920 GET_THREAD_INFO(%rcx)
921 jmp retint_with_reschedule
922
923 #ifdef CONFIG_PREEMPT
924 /* Returning to kernel space. Check if we need preemption */
925 /* rcx: threadinfo. interrupts off. */
926 ENTRY(retint_kernel)
927 cmpl $0,TI_preempt_count(%rcx)
928 jnz retint_restore_args
929 bt $TIF_NEED_RESCHED,TI_flags(%rcx)
930 jnc retint_restore_args
931 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
932 jnc retint_restore_args
933 call preempt_schedule_irq
934 jmp exit_intr
935 #endif
936
937 CFI_ENDPROC
938 END(common_interrupt)
939 /*
940 * End of kprobes section
941 */
942 .popsection
943
944 /*
945 * APIC interrupts.
946 */
947 .macro apicinterrupt num sym do_sym
948 ENTRY(\sym)
949 INTR_FRAME
950 pushq_cfi $~(\num)
951 .Lcommon_\sym:
952 interrupt \do_sym
953 jmp ret_from_intr
954 CFI_ENDPROC
955 END(\sym)
956 .endm
957
958 #ifdef CONFIG_SMP
959 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
960 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
961 apicinterrupt REBOOT_VECTOR \
962 reboot_interrupt smp_reboot_interrupt
963 #endif
964
965 #ifdef CONFIG_X86_UV
966 apicinterrupt UV_BAU_MESSAGE \
967 uv_bau_message_intr1 uv_bau_message_interrupt
968 #endif
969 apicinterrupt LOCAL_TIMER_VECTOR \
970 apic_timer_interrupt smp_apic_timer_interrupt
971 apicinterrupt X86_PLATFORM_IPI_VECTOR \
972 x86_platform_ipi smp_x86_platform_ipi
973
974 #ifdef CONFIG_SMP
975 ALIGN
976 INTR_FRAME
977 .irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
978 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
979 .if NUM_INVALIDATE_TLB_VECTORS > \idx
980 ENTRY(invalidate_interrupt\idx)
981 pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
982 jmp .Lcommon_invalidate_interrupt0
983 CFI_ADJUST_CFA_OFFSET -8
984 END(invalidate_interrupt\idx)
985 .endif
986 .endr
987 CFI_ENDPROC
988 apicinterrupt INVALIDATE_TLB_VECTOR_START, \
989 invalidate_interrupt0, smp_invalidate_interrupt
990 #endif
991
992 apicinterrupt THRESHOLD_APIC_VECTOR \
993 threshold_interrupt smp_threshold_interrupt
994 apicinterrupt THERMAL_APIC_VECTOR \
995 thermal_interrupt smp_thermal_interrupt
996
997 #ifdef CONFIG_SMP
998 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
999 call_function_single_interrupt smp_call_function_single_interrupt
1000 apicinterrupt CALL_FUNCTION_VECTOR \
1001 call_function_interrupt smp_call_function_interrupt
1002 apicinterrupt RESCHEDULE_VECTOR \
1003 reschedule_interrupt smp_reschedule_interrupt
1004 #endif
1005
1006 apicinterrupt ERROR_APIC_VECTOR \
1007 error_interrupt smp_error_interrupt
1008 apicinterrupt SPURIOUS_APIC_VECTOR \
1009 spurious_interrupt smp_spurious_interrupt
1010
1011 #ifdef CONFIG_IRQ_WORK
1012 apicinterrupt IRQ_WORK_VECTOR \
1013 irq_work_interrupt smp_irq_work_interrupt
1014 #endif
1015
1016 /*
1017 * Exception entry points.
1018 */
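/*
 * zeroentry is used for exceptions that do not push a hardware error
 * code (a fake ORIG_RAX of -1 is pushed instead), errorentry for those
 * that do.  The paranoid* variants go through save_paranoid because
 * they can be entered with either the user or the kernel GS base
 * active (e.g. double fault, machine check, debug).
 */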
1019 .macro zeroentry sym do_sym
1020 ENTRY(\sym)
1021 INTR_FRAME
1022 PARAVIRT_ADJUST_EXCEPTION_FRAME
1023 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1024 subq $ORIG_RAX-R15, %rsp
1025 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1026 call error_entry
1027 DEFAULT_FRAME 0
1028 movq %rsp,%rdi /* pt_regs pointer */
1029 xorl %esi,%esi /* no error code */
1030 call \do_sym
1031 jmp error_exit /* %ebx: no swapgs flag */
1032 CFI_ENDPROC
1033 END(\sym)
1034 .endm
1035
1036 .macro paranoidzeroentry sym do_sym
1037 ENTRY(\sym)
1038 INTR_FRAME
1039 PARAVIRT_ADJUST_EXCEPTION_FRAME
1040 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1041 subq $ORIG_RAX-R15, %rsp
1042 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1043 call save_paranoid
1044 TRACE_IRQS_OFF
1045 movq %rsp,%rdi /* pt_regs pointer */
1046 xorl %esi,%esi /* no error code */
1047 call \do_sym
1048 jmp paranoid_exit /* %ebx: no swapgs flag */
1049 CFI_ENDPROC
1050 END(\sym)
1051 .endm
1052
1053 #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
1054 .macro paranoidzeroentry_ist sym do_sym ist
1055 ENTRY(\sym)
1056 INTR_FRAME
1057 PARAVIRT_ADJUST_EXCEPTION_FRAME
1058 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1059 subq $ORIG_RAX-R15, %rsp
1060 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1061 call save_paranoid
1062 TRACE_IRQS_OFF
1063 movq %rsp,%rdi /* pt_regs pointer */
1064 xorl %esi,%esi /* no error code */
1065 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1066 call \do_sym
1067 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1068 jmp paranoid_exit /* %ebx: no swapgs flag */
1069 CFI_ENDPROC
1070 END(\sym)
1071 .endm
1072
1073 .macro errorentry sym do_sym
1074 ENTRY(\sym)
1075 XCPT_FRAME
1076 PARAVIRT_ADJUST_EXCEPTION_FRAME
1077 subq $ORIG_RAX-R15, %rsp
1078 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1079 call error_entry
1080 DEFAULT_FRAME 0
1081 movq %rsp,%rdi /* pt_regs pointer */
1082 movq ORIG_RAX(%rsp),%rsi /* get error code */
1083 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
1084 call \do_sym
1085 jmp error_exit /* %ebx: no swapgs flag */
1086 CFI_ENDPROC
1087 END(\sym)
1088 .endm
1089
1090 /* error code is on the stack already */
1091 .macro paranoiderrorentry sym do_sym
1092 ENTRY(\sym)
1093 XCPT_FRAME
1094 PARAVIRT_ADJUST_EXCEPTION_FRAME
1095 subq $ORIG_RAX-R15, %rsp
1096 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1097 call save_paranoid
1098 DEFAULT_FRAME 0
1099 TRACE_IRQS_OFF
1100 movq %rsp,%rdi /* pt_regs pointer */
1101 movq ORIG_RAX(%rsp),%rsi /* get error code */
1102 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
1103 call \do_sym
1104 jmp paranoid_exit /* %ebx: no swapgs flag */
1105 CFI_ENDPROC
1106 END(\sym)
1107 .endm
1108
1109 zeroentry divide_error do_divide_error
1110 zeroentry overflow do_overflow
1111 zeroentry bounds do_bounds
1112 zeroentry invalid_op do_invalid_op
1113 zeroentry device_not_available do_device_not_available
1114 paranoiderrorentry double_fault do_double_fault
1115 zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
1116 errorentry invalid_TSS do_invalid_TSS
1117 errorentry segment_not_present do_segment_not_present
1118 zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
1119 zeroentry coprocessor_error do_coprocessor_error
1120 errorentry alignment_check do_alignment_check
1121 zeroentry simd_coprocessor_error do_simd_coprocessor_error
1122
1123
1124 /* Reload gs selector with exception handling */
1125 /* edi: new selector */
1126 ENTRY(native_load_gs_index)
1127 CFI_STARTPROC
1128 pushfq_cfi
1129 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1130 SWAPGS
1131 gs_change:
1132 movl %edi,%gs
1133 2: mfence /* workaround */
1134 SWAPGS
1135 popfq_cfi
1136 ret
1137 CFI_ENDPROC
1138 END(native_load_gs_index)
1139
1140 .section __ex_table,"a"
1141 .align 8
1142 .quad gs_change,bad_gs
1143 .previous
1144 .section .fixup,"ax"
1145 /* running with kernelgs */
1146 bad_gs:
1147 SWAPGS /* switch back to user gs */
1148 xorl %eax,%eax
1149 movl %eax,%gs
1150 jmp 2b
1151 .previous
1152
1153 ENTRY(kernel_thread_helper)
1154 pushq $0 # fake return address
1155 CFI_STARTPROC
1156 /*
1157 * Here we are in the child and the registers are set as they were
1158 * at kernel_thread() invocation in the parent.
1159 */
1160 call *%rsi
1161 # exit
1162 mov %eax, %edi
1163 call do_exit
1164 ud2 # padding for call trace
1165 CFI_ENDPROC
1166 END(kernel_thread_helper)
1167
1168 /*
1169 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
1170 *
1171 * C extern interface:
1172 * extern long execve(const char *name, char **argv, char **envp)
1173 *
1174 * asm input arguments:
1175 * rdi: name, rsi: argv, rdx: envp
1176 *
1177  * We want to fall back into:
1178 * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
1179 *
1180 * do_sys_execve asm fallback arguments:
1181 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
1182 */
1183 ENTRY(kernel_execve)
1184 CFI_STARTPROC
1185 FAKE_STACK_FRAME $0
1186 SAVE_ALL
1187 movq %rsp,%rcx
1188 call sys_execve
1189 movq %rax, RAX(%rsp)
1190 RESTORE_REST
1191 testq %rax,%rax
1192 je int_ret_from_sys_call
1193 RESTORE_ARGS
1194 UNFAKE_STACK_FRAME
1195 ret
1196 CFI_ENDPROC
1197 END(kernel_execve)
1198
1199 /* Call softirq on interrupt stack. Interrupts are off. */
1200 ENTRY(call_softirq)
1201 CFI_STARTPROC
1202 pushq_cfi %rbp
1203 CFI_REL_OFFSET rbp,0
1204 mov %rsp,%rbp
1205 CFI_DEF_CFA_REGISTER rbp
1206 incl PER_CPU_VAR(irq_count)
1207 cmove PER_CPU_VAR(irq_stack_ptr),%rsp
1208 push %rbp # backlink for old unwinder
1209 call __do_softirq
1210 leaveq
1211 CFI_RESTORE rbp
1212 CFI_DEF_CFA_REGISTER rsp
1213 CFI_ADJUST_CFA_OFFSET -8
1214 decl PER_CPU_VAR(irq_count)
1215 ret
1216 CFI_ENDPROC
1217 END(call_softirq)
1218
1219 #ifdef CONFIG_XEN
1220 zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
1221
1222 /*
1223 * A note on the "critical region" in our callback handler.
1224 * We want to avoid stacking callback handlers due to events occurring
1225 * during handling of the last event. To do this, we keep events disabled
1226 * until we've done all processing. HOWEVER, we must enable events before
1227 * popping the stack frame (can't be done atomically) and so it would still
1228 * be possible to get enough handler activations to overflow the stack.
1229 * Although unlikely, bugs of that kind are hard to track down, so we'd
1230 * like to avoid the possibility.
1231 * So, on entry to the handler we detect whether we interrupted an
1232 * existing activation in its critical region -- if so, we pop the current
1233 * activation and restart the handler using the previous one.
1234 */
1235 ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct pt_regs *)
1236 CFI_STARTPROC
1237 /*
1238	 * Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
1239 * see the correct pointer to the pt_regs
1240 */
1241 movq %rdi, %rsp # we don't return, adjust the stack frame
1242 CFI_ENDPROC
1243 DEFAULT_FRAME
1244 11: incl PER_CPU_VAR(irq_count)
1245 movq %rsp,%rbp
1246 CFI_DEF_CFA_REGISTER rbp
1247 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
1248 pushq %rbp # backlink for old unwinder
1249 call xen_evtchn_do_upcall
1250 popq %rsp
1251 CFI_DEF_CFA_REGISTER rsp
1252 decl PER_CPU_VAR(irq_count)
1253 jmp error_exit
1254 CFI_ENDPROC
1255 END(xen_do_hypervisor_callback)
1256
1257 /*
1258 * Hypervisor uses this for application faults while it executes.
1259 * We get here for two reasons:
1260 * 1. Fault while reloading DS, ES, FS or GS
1261 * 2. Fault while executing IRET
1262 * Category 1 we do not need to fix up as Xen has already reloaded all segment
1263 * registers that could be reloaded and zeroed the others.
1264 * Category 2 we fix up by killing the current process. We cannot use the
1265 * normal Linux return path in this case because if we use the IRET hypercall
1266 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1267 * We distinguish between categories by comparing each saved segment register
1268  * with its current contents: any discrepancy means we are in category 1.
1269 */
1270 ENTRY(xen_failsafe_callback)
1271 INTR_FRAME 1 (6*8)
1272 /*CFI_REL_OFFSET gs,GS*/
1273 /*CFI_REL_OFFSET fs,FS*/
1274 /*CFI_REL_OFFSET es,ES*/
1275 /*CFI_REL_OFFSET ds,DS*/
1276 CFI_REL_OFFSET r11,8
1277 CFI_REL_OFFSET rcx,0
1278 movw %ds,%cx
1279 cmpw %cx,0x10(%rsp)
1280 CFI_REMEMBER_STATE
1281 jne 1f
1282 movw %es,%cx
1283 cmpw %cx,0x18(%rsp)
1284 jne 1f
1285 movw %fs,%cx
1286 cmpw %cx,0x20(%rsp)
1287 jne 1f
1288 movw %gs,%cx
1289 cmpw %cx,0x28(%rsp)
1290 jne 1f
1291 /* All segments match their saved values => Category 2 (Bad IRET). */
1292 movq (%rsp),%rcx
1293 CFI_RESTORE rcx
1294 movq 8(%rsp),%r11
1295 CFI_RESTORE r11
1296 addq $0x30,%rsp
1297 CFI_ADJUST_CFA_OFFSET -0x30
1298 pushq_cfi $0 /* RIP */
1299 pushq_cfi %r11
1300 pushq_cfi %rcx
1301 jmp general_protection
1302 CFI_RESTORE_STATE
1303 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1304 movq (%rsp),%rcx
1305 CFI_RESTORE rcx
1306 movq 8(%rsp),%r11
1307 CFI_RESTORE r11
1308 addq $0x30,%rsp
1309 CFI_ADJUST_CFA_OFFSET -0x30
1310 pushq_cfi $0
1311 SAVE_ALL
1312 jmp error_exit
1313 CFI_ENDPROC
1314 END(xen_failsafe_callback)
1315
1316 apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
1317 xen_hvm_callback_vector xen_evtchn_do_upcall
1318
1319 #endif /* CONFIG_XEN */
1320
1321 /*
1322 * Some functions should be protected against kprobes
1323 */
1324 .pushsection .kprobes.text, "ax"
1325
1326 paranoidzeroentry_ist debug do_debug DEBUG_STACK
1327 paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1328 paranoiderrorentry stack_segment do_stack_segment
1329 #ifdef CONFIG_XEN
1330 zeroentry xen_debug do_debug
1331 zeroentry xen_int3 do_int3
1332 errorentry xen_stack_segment do_stack_segment
1333 #endif
1334 errorentry general_protection do_general_protection
1335 errorentry page_fault do_page_fault
1336 #ifdef CONFIG_KVM_GUEST
1337 errorentry async_page_fault do_async_page_fault
1338 #endif
1339 #ifdef CONFIG_X86_MCE
1340 paranoidzeroentry machine_check *machine_check_vector(%rip)
1341 #endif
1342
1343 /*
1344 * "Paranoid" exit path from exception stack.
1345 * Paranoid because this is used by NMIs and cannot take
1346 * any kernel state for granted.
1347 * We don't do kernel preemption checks here, because only
1348 * NMI should be common and it does not enable IRQs and
1349 * cannot get reschedule ticks.
1350 *
1351 * "trace" is 0 for the NMI handler only, because irq-tracing
1352 * is fundamentally NMI-unsafe. (we cannot change the soft and
1353 * hard flags at once, atomically)
1354 */
1355
1356 /* ebx: no swapgs flag */
1357 ENTRY(paranoid_exit)
1358 DEFAULT_FRAME
1359 DISABLE_INTERRUPTS(CLBR_NONE)
1360 TRACE_IRQS_OFF
1361 testl %ebx,%ebx /* swapgs needed? */
1362 jnz paranoid_restore
1363 testl $3,CS(%rsp)
1364 jnz paranoid_userspace
1365 paranoid_swapgs:
1366 TRACE_IRQS_IRETQ 0
1367 SWAPGS_UNSAFE_STACK
1368 RESTORE_ALL 8
1369 jmp irq_return
1370 paranoid_restore:
1371 TRACE_IRQS_IRETQ 0
1372 RESTORE_ALL 8
1373 jmp irq_return
1374 paranoid_userspace:
1375 GET_THREAD_INFO(%rcx)
1376 movl TI_flags(%rcx),%ebx
1377 andl $_TIF_WORK_MASK,%ebx
1378 jz paranoid_swapgs
1379 movq %rsp,%rdi /* &pt_regs */
1380 call sync_regs
1381 movq %rax,%rsp /* switch stack for scheduling */
1382 testl $_TIF_NEED_RESCHED,%ebx
1383 jnz paranoid_schedule
1384 movl %ebx,%edx /* arg3: thread flags */
1385 TRACE_IRQS_ON
1386 ENABLE_INTERRUPTS(CLBR_NONE)
1387 xorl %esi,%esi /* arg2: oldset */
1388 movq %rsp,%rdi /* arg1: &pt_regs */
1389 call do_notify_resume
1390 DISABLE_INTERRUPTS(CLBR_NONE)
1391 TRACE_IRQS_OFF
1392 jmp paranoid_userspace
1393 paranoid_schedule:
1394 TRACE_IRQS_ON
1395 ENABLE_INTERRUPTS(CLBR_ANY)
1396 call schedule
1397 DISABLE_INTERRUPTS(CLBR_ANY)
1398 TRACE_IRQS_OFF
1399 jmp paranoid_userspace
1400 CFI_ENDPROC
1401 END(paranoid_exit)
1402
1403 /*
1404  * Exception entry point. This expects an error code/orig_rax on the stack
1405  * and returns the "no swapgs flag" in %ebx.
1406 */
1407 ENTRY(error_entry)
1408 XCPT_FRAME
1409 CFI_ADJUST_CFA_OFFSET 15*8
1410 /* oldrax contains error code */
1411 cld
1412 movq_cfi rdi, RDI+8
1413 movq_cfi rsi, RSI+8
1414 movq_cfi rdx, RDX+8
1415 movq_cfi rcx, RCX+8
1416 movq_cfi rax, RAX+8
1417 movq_cfi r8, R8+8
1418 movq_cfi r9, R9+8
1419 movq_cfi r10, R10+8
1420 movq_cfi r11, R11+8
1421 movq_cfi rbx, RBX+8
1422 movq_cfi rbp, RBP+8
1423 movq_cfi r12, R12+8
1424 movq_cfi r13, R13+8
1425 movq_cfi r14, R14+8
1426 movq_cfi r15, R15+8
1427 xorl %ebx,%ebx
1428 testl $3,CS+8(%rsp)
1429 je error_kernelspace
1430 error_swapgs:
1431 SWAPGS
1432 error_sti:
1433 TRACE_IRQS_OFF
1434 ret
1435
1436 /*
1437 * There are two places in the kernel that can potentially fault with
1438 * usergs. Handle them here. The exception handlers after iret run with
1439 * kernel gs again, so don't set the user space flag. B stepping K8s
1440 * sometimes report an truncated RIP for IRET exceptions returning to
1441 * compat mode. Check for these here too.
1442 */
1443 error_kernelspace:
1444 incl %ebx
1445 leaq irq_return(%rip),%rcx
1446 cmpq %rcx,RIP+8(%rsp)
1447 je error_swapgs
1448 movl %ecx,%eax /* zero extend */
1449 cmpq %rax,RIP+8(%rsp)
1450 je bstep_iret
1451 cmpq $gs_change,RIP+8(%rsp)
1452 je error_swapgs
1453 jmp error_sti
1454
1455 bstep_iret:
1456 /* Fix truncated RIP */
1457 movq %rcx,RIP+8(%rsp)
1458 jmp error_swapgs
1459 CFI_ENDPROC
1460 END(error_entry)
1461
1462
1463 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1464 ENTRY(error_exit)
1465 DEFAULT_FRAME
1466 movl %ebx,%eax
1467 RESTORE_REST
1468 DISABLE_INTERRUPTS(CLBR_NONE)
1469 TRACE_IRQS_OFF
1470 GET_THREAD_INFO(%rcx)
1471 testl %eax,%eax
1472 jne retint_kernel
1473 LOCKDEP_SYS_EXIT_IRQ
1474 movl TI_flags(%rcx),%edx
1475 movl $_TIF_WORK_MASK,%edi
1476 andl %edi,%edx
1477 jnz retint_careful
1478 jmp retint_swapgs
1479 CFI_ENDPROC
1480 END(error_exit)
1481
1482 /*
1483 * Test if a given stack is an NMI stack or not.
1484 */
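/*
 * \reg must hold the top of the candidate NMI stack and is clobbered.
 * Branches to \nmi_ret iff \stack lies within the EXCEPTION_STKSZ bytes
 * below that top, otherwise to \normal_ret.
 */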
1485 .macro test_in_nmi reg stack nmi_ret normal_ret
1486 cmpq %\reg, \stack
1487 ja \normal_ret
1488 subq $EXCEPTION_STKSZ, %\reg
1489 cmpq %\reg, \stack
1490 jb \normal_ret
1491 jmp \nmi_ret
1492 .endm
1493
1494 /* runs on exception stack */
1495 ENTRY(nmi)
1496 INTR_FRAME
1497 PARAVIRT_ADJUST_EXCEPTION_FRAME
1498 /*
1499 * We allow breakpoints in NMIs. If a breakpoint occurs, then
1500 * the iretq it performs will take us out of NMI context.
1501 * This means that we can have nested NMIs where the next
1502 * NMI is using the top of the stack of the previous NMI. We
1503 * can't let it execute because the nested NMI will corrupt the
1504 * stack of the previous NMI. NMI handlers are not re-entrant
1505 * anyway.
1506 *
1507 * To handle this case we do the following:
1508  *  Check a special location on the stack that contains
1509 * a variable that is set when NMIs are executing.
1510 * The interrupted task's stack is also checked to see if it
1511 * is an NMI stack.
1512 * If the variable is not set and the stack is not the NMI
1513 * stack then:
1514 * o Set the special variable on the stack
1515 * o Copy the interrupt frame into a "saved" location on the stack
1516 * o Copy the interrupt frame into a "copy" location on the stack
1517 * o Continue processing the NMI
1518 * If the variable is set or the previous stack is the NMI stack:
1519  *    o Modify the "copy" location to jump to repeat_nmi
1520  *    o Return back to the first NMI
1521 *
1522  * Now on exit of the first NMI, we first clear the stack variable.
1523 * The NMI stack will tell any nested NMIs at that point that it is
1524 * nested. Then we pop the stack normally with iret, and if there was
1525 * a nested NMI that updated the copy interrupt stack frame, a
1526 * jump will be made to the repeat_nmi code that will handle the second
1527 * NMI.
1528 */
1529
1530	/* Use %rdx as our temp variable throughout */
1531 pushq_cfi %rdx
1532
1533 /*
1534 * If %cs was not the kernel segment, then the NMI triggered in user
1535 * space, which means it is definitely not nested.
1536 */
1537 cmpl $__KERNEL_CS, 16(%rsp)
1538 jne first_nmi
1539
1540 /*
1541 * Check the special variable on the stack to see if NMIs are
1542 * executing.
1543 */
1544 cmpl $1, -8(%rsp)
1545 je nested_nmi
1546
1547 /*
1548 * Now test if the previous stack was an NMI stack.
1549	 * We need the double check. We check the NMI stack to cover the race
1550	 * where the first NMI clears the variable before returning.
1551 * We check the variable because the first NMI could be in a
1552 * breakpoint routine using a breakpoint stack.
1553 */
1554 lea 6*8(%rsp), %rdx
1555 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
1556
1557 nested_nmi:
1558 /*
1559 * Do nothing if we interrupted the fixup in repeat_nmi.
1560 * It's about to repeat the NMI handler, so we are fine
1561 * with ignoring this one.
1562 */
1563 movq $repeat_nmi, %rdx
1564 cmpq 8(%rsp), %rdx
1565 ja 1f
1566 movq $end_repeat_nmi, %rdx
1567 cmpq 8(%rsp), %rdx
1568 ja nested_nmi_out
1569
1570 1:
1571	/* Set up the interrupted NMI's stack to jump to repeat_nmi */
1572 leaq -6*8(%rsp), %rdx
1573 movq %rdx, %rsp
1574 CFI_ADJUST_CFA_OFFSET 6*8
1575 pushq_cfi $__KERNEL_DS
1576 pushq_cfi %rdx
1577 pushfq_cfi
1578 pushq_cfi $__KERNEL_CS
1579 pushq_cfi $repeat_nmi
1580
1581 /* Put stack back */
1582 addq $(11*8), %rsp
1583 CFI_ADJUST_CFA_OFFSET -11*8
1584
1585 nested_nmi_out:
1586 popq_cfi %rdx
1587
1588 /* No need to check faults here */
1589 INTERRUPT_RETURN
1590
1591 first_nmi:
1592 /*
1593 * Because nested NMIs will use the pushed location that we
1594 * stored in rdx, we must keep that space available.
1595 * Here's what our stack frame will look like:
1596 * +-------------------------+
1597 * | original SS |
1598 * | original Return RSP |
1599 * | original RFLAGS |
1600 * | original CS |
1601 * | original RIP |
1602 * +-------------------------+
1603 * | temp storage for rdx |
1604 * +-------------------------+
1605 * | NMI executing variable |
1606 * +-------------------------+
1607 * | Saved SS |
1608 * | Saved Return RSP |
1609 * | Saved RFLAGS |
1610 * | Saved CS |
1611 * | Saved RIP |
1612 * +-------------------------+
1613 * | copied SS |
1614 * | copied Return RSP |
1615 * | copied RFLAGS |
1616 * | copied CS |
1617 * | copied RIP |
1618 * +-------------------------+
1619 * | pt_regs |
1620 * +-------------------------+
1621 *
1622 * The saved RIP is used to fix up the copied RIP that a nested
1623 * NMI may zero out. The original stack frame and the temp storage
1624	 * are also used by nested NMIs and cannot be trusted on exit.
1625 */
1626 /* Set the NMI executing variable on the stack. */
1627 pushq_cfi $1
1628
1629 /* Copy the stack frame to the Saved frame */
1630 .rept 5
1631 pushq_cfi 6*8(%rsp)
1632 .endr
1633
1634 /* Make another copy, this one may be modified by nested NMIs */
1635 .rept 5
1636 pushq_cfi 4*8(%rsp)
1637 .endr
1638
1639 /* Do not pop rdx, nested NMIs will corrupt it */
1640 movq 11*8(%rsp), %rdx
1641
1642 /*
1643 * Everything below this point can be preempted by a nested
1644 * NMI if the first NMI took an exception. Repeated NMIs
1645 * caused by an exception and nested NMI will start here, and
1646 * can still be preempted by another NMI.
1647 */
1648 restart_nmi:
1649 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1650 subq $ORIG_RAX-R15, %rsp
1651 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1652 /*
1653 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
1654	 * as we should not be calling schedule in NMI context,
1655	 * even with normal interrupts enabled. An NMI should not be
1656 * setting NEED_RESCHED or anything that normal interrupts and
1657 * exceptions might do.
1658 */
1659 call save_paranoid
1660 DEFAULT_FRAME 0
1661 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1662 movq %rsp,%rdi
1663 movq $-1,%rsi
1664 call do_nmi
1665 testl %ebx,%ebx /* swapgs needed? */
1666 jnz nmi_restore
1667 nmi_swapgs:
1668 SWAPGS_UNSAFE_STACK
1669 nmi_restore:
1670 RESTORE_ALL 8
1671 /* Clear the NMI executing stack variable */
1672 movq $0, 10*8(%rsp)
1673 jmp irq_return
1674 CFI_ENDPROC
1675 END(nmi)
1676
1677 /*
1678  * If an exception or breakpoint inside an NMI handler causes an iret,
1679  * the NMI can lose its NMI context, and a nested NMI may come in.
1680 * In that case, the nested NMI will change the preempted NMI's
1681 * stack to jump to here when it does the final iret.
1682 */
1683 repeat_nmi:
1684 INTR_FRAME
1685 /* Update the stack variable to say we are still in NMI */
1686 movq $1, 5*8(%rsp)
1687
1688 /* copy the saved stack back to copy stack */
1689 .rept 5
1690 pushq_cfi 4*8(%rsp)
1691 .endr
1692
1693 jmp restart_nmi
1694 CFI_ENDPROC
1695 end_repeat_nmi:
1696
1697 ENTRY(ignore_sysret)
1698 CFI_STARTPROC
1699 mov $-ENOSYS,%eax
1700 sysret
1701 CFI_ENDPROC
1702 END(ignore_sysret)
1703
1704 /*
1705 * End of kprobes section
1706 */
1707 .popsection