arch/x86/entry/entry_64.S
b2441318 1/* SPDX-License-Identifier: GPL-2.0 */
1da177e4
LT
2/*
3 * linux/arch/x86_64/entry.S
4 *
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
7 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
4d732138 8 *
1da177e4
LT
9 * entry.S contains the system-call and fault low-level handling routines.
10 *
cb1aaebe 11 * Some of this is documented in Documentation/x86/entry_64.rst
8b4777a4 12 *
0bd7b798 13 * A note on terminology:
4d732138
IM
14 * - iret frame: Architecture defined interrupt frame from SS to RIP
15 * at the top of the kernel process stack.
2e91a17b
AK
16 *
17 * Some macro usage:
6dcc5627 18 * - SYM_FUNC_START/END: Define functions in the symbol table.
4d732138 19 * - idtentry: Define exception entry points.
1da177e4 20 */
1da177e4
LT
21#include <linux/linkage.h>
22#include <asm/segment.h>
1da177e4
LT
23#include <asm/cache.h>
24#include <asm/errno.h>
e2d5df93 25#include <asm/asm-offsets.h>
1da177e4
LT
26#include <asm/msr.h>
27#include <asm/unistd.h>
28#include <asm/thread_info.h>
29#include <asm/hw_irq.h>
0341c14d 30#include <asm/page_types.h>
2601e64d 31#include <asm/irqflags.h>
72fe4858 32#include <asm/paravirt.h>
9939ddaf 33#include <asm/percpu.h>
d7abc0fa 34#include <asm/asm.h>
63bcff2a 35#include <asm/smap.h>
3891a04a 36#include <asm/pgtable_types.h>
784d5699 37#include <asm/export.h>
8c1f7558 38#include <asm/frame.h>
cfa82a00 39#include <asm/trapnr.h>
2641f08b 40#include <asm/nospec-branch.h>
c82965f9 41#include <asm/fsgsbase.h>
d7e7528b 42#include <linux/err.h>
1da177e4 43
6fd166aa
PZ
44#include "calling.h"
45
4d732138
IM
46.code64
47.section .entry.text, "ax"
16444a8a 48
1da177e4 49/*
4d732138 50 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
1da177e4 51 *
fda57b22
AL
52 * This is the only entry point used for 64-bit system calls. The
53 * hardware interface is reasonably well designed and the register to
54 * argument mapping Linux uses fits well with the registers that are
55 * available when SYSCALL is used.
56 *
57 * SYSCALL instructions can be found inlined in libc implementations as
58 * well as some other programs and libraries. There are also a handful
59 * of SYSCALL instructions in the vDSO used, for example, as a
60 * clock_gettimeofday fallback.
61 *
4d732138 62 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
b87cf63e
DV
63 * then loads new ss, cs, and rip from previously programmed MSRs.
64 * rflags gets masked by a value from another MSR (so CLD and CLAC
65 * are not needed). SYSCALL does not save anything on the stack
66 * and does not change rsp.
67 *
68 * Registers on entry:
1da177e4 69 * rax system call number
b87cf63e
DV
70 * rcx return address
71 * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
1da177e4 72 * rdi arg0
1da177e4 73 * rsi arg1
0bd7b798 74 * rdx arg2
b87cf63e 75 * r10 arg3 (needs to be moved to rcx to conform to C ABI)
1da177e4
LT
76 * r8 arg4
77 * r9 arg5
4d732138 78 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
0bd7b798 79 *
1da177e4
LT
80 * Only called from user space.
81 *
7fcb3bc3 82 * When the user can change pt_regs->foo, always force IRET. That is because
7bf36bbc
AK
83 * it deals with uncanonical addresses better. SYSRET has trouble
84 * with them due to bugs in both AMD and Intel CPUs.
0bd7b798 85 */
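For reference, here is what the register convention documented above looks like from user space. This is an illustrative C sketch, not part of this file: the raw_syscall3() helper, the write(2) example and its argument values are assumptions; args 3-5 would additionally go in r10, r8 and r9.

/* Illustrative sketch: issue a raw 64-bit SYSCALL from user space. */
static long raw_syscall3(long nr, long a0, long a1, long a2)
{
	long ret;

	/*
	 * rax = syscall number, rdi/rsi/rdx = arg0..arg2. The entry code
	 * above receives rcx = return address and r11 = saved rflags, so
	 * both registers are clobbered by the instruction.
	 */
	__asm__ volatile ("syscall"
			  : "=a" (ret)
			  : "a" (nr), "D" (a0), "S" (a1), "d" (a2)
			  : "rcx", "r11", "memory");
	return ret;
}

int main(void)
{
	static const char msg[] = "hello via SYSCALL\n";

	/* 1 == __NR_write on x86-64; fd 1 is stdout. */
	return raw_syscall3(1, 1, (long)msg, sizeof(msg) - 1) < 0;
}
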
1da177e4 86
bc7b11c0 87SYM_CODE_START(entry_SYSCALL_64)
8c1f7558 88 UNWIND_HINT_EMPTY
72fe4858 89
8a9949bc 90 swapgs
bf904d27 91 /* tss.sp2 is scratch space. */
98f05b51 92 movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
bf904d27 93 SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
4d732138 94 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
9ed8e7d8 95
a13644f3
JR
96SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
97
9ed8e7d8 98 /* Construct struct pt_regs on stack */
98f05b51
AL
99 pushq $__USER_DS /* pt_regs->ss */
100 pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
101 pushq %r11 /* pt_regs->flags */
102 pushq $__USER_CS /* pt_regs->cs */
103 pushq %rcx /* pt_regs->ip */
26ba4e57 104SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
98f05b51 105 pushq %rax /* pt_regs->orig_ax */
30907fd1
DB
106
107 PUSH_AND_CLEAR_REGS rax=$-ENOSYS
4d732138 108
1e423bff 109 /* IRQs are off. */
3e5e7f77 110 movq %rsp, %rdi
05954948
PAI
111 /* Sign-extend the lower 32 bits, as syscall numbers are treated as int */
112 movslq %eax, %rsi
1e423bff
AL
113 call do_syscall_64 /* returns with IRQs disabled */
114
fffbb5dc
DV
115 /*
116 * Try to use SYSRET instead of IRET if we're returning to
8a055d7f
AL
117 * a completely clean 64-bit userspace context. If we're not,
118 * go to the slow exit path.
afd30525 119 * In the Xen PV case we must use iret anyway.
fffbb5dc 120 */
afd30525
JG
121
122 ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
123 X86_FEATURE_XENPV
124
4d732138
IM
125 movq RCX(%rsp), %rcx
126 movq RIP(%rsp), %r11
8a055d7f
AL
127
128 cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
129 jne swapgs_restore_regs_and_return_to_usermode
fffbb5dc
DV
130
131 /*
132 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
133 * in kernel space. This essentially lets the user take over
17be0aec 134 * the kernel, since userspace controls RSP.
fffbb5dc 135 *
17be0aec 136 * If the width of the "canonical tail" ever becomes variable, this will need
fffbb5dc 137 * to be updated to remain correct on both old and new CPUs.
361b4b58 138 *
cbe0317b
KS
139 * Change top bits to match most significant bit (47th or 56th bit
140 * depending on paging mode) in the address.
fffbb5dc 141 */
09e61a77 142#ifdef CONFIG_X86_5LEVEL
39b95522
KS
143 ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
144 "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
09e61a77 145#else
17be0aec
DV
146 shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
147 sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
09e61a77 148#endif
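The shl/sar pair above is the standard sign-extension trick. As a plain C sketch (is_canonical() is an illustrative name; va_bits stands for __VIRTUAL_MASK_SHIFT + 1, i.e. 48 with 4-level and 57 with 5-level paging):

#include <stdbool.h>

/*
 * Sketch: an address is canonical iff sign-extending it from bit
 * (va_bits - 1) leaves it unchanged, which is what the shl/sar pair checks.
 */
static bool is_canonical(unsigned long addr, unsigned int va_bits)
{
	long sext = (long)(addr << (64 - va_bits)) >> (64 - va_bits);

	return (unsigned long)sext == addr;
}
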
4d732138 149
17be0aec
DV
150 /* If this changed %rcx, it was not canonical */
151 cmpq %rcx, %r11
8a055d7f 152 jne swapgs_restore_regs_and_return_to_usermode
fffbb5dc 153
4d732138 154 cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
8a055d7f 155 jne swapgs_restore_regs_and_return_to_usermode
fffbb5dc 156
4d732138
IM
157 movq R11(%rsp), %r11
158 cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
8a055d7f 159 jne swapgs_restore_regs_and_return_to_usermode
fffbb5dc
DV
160
161 /*
3e035305
BP
162 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
163 * restore RF properly. If the slowpath sets it for whatever reason, we
164 * need to restore it correctly.
165 *
166 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
167 * trap from userspace immediately after SYSRET. This would cause an
168 * infinite loop whenever #DB happens with register state that satisfies
169 * the opportunistic SYSRET conditions. For example, single-stepping
170 * this user code:
fffbb5dc 171 *
4d732138 172 * movq $stuck_here, %rcx
fffbb5dc
DV
173 * pushfq
174 * popq %r11
175 * stuck_here:
176 *
177 * would never get past 'stuck_here'.
178 */
4d732138 179 testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
8a055d7f 180 jnz swapgs_restore_regs_and_return_to_usermode
fffbb5dc
DV
181
182 /* nothing to check for RSP */
183
4d732138 184 cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
8a055d7f 185 jne swapgs_restore_regs_and_return_to_usermode
fffbb5dc
DV
186
187 /*
4d732138
IM
188 * We win! This label is here just for ease of understanding
189 * perf profiles. Nothing jumps here.
fffbb5dc
DV
190 */
191syscall_return_via_sysret:
17be0aec 192 /* rcx and r11 are already restored (see code above) */
502af0d7 193 POP_REGS pop_rdi=0 skip_r11rcx=1
3e3b9293
AL
194
195 /*
196 * Now all regs are restored except RSP and RDI.
197 * Save old stack pointer and switch to trampoline stack.
198 */
199 movq %rsp, %rdi
c482feef 200 movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
1fb14363 201 UNWIND_HINT_EMPTY
3e3b9293
AL
202
203 pushq RSP-RDI(%rdi) /* RSP */
204 pushq (%rdi) /* RDI */
205
206 /*
207 * We are on the trampoline stack. All regs except RDI are live.
208 * We can do future final exit work right here.
209 */
afaef01c
AP
210 STACKLEAK_ERASE_NOCLOBBER
211
6fd166aa 212 SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
3e3b9293 213
4fbb3910 214 popq %rdi
3e3b9293 215 popq %rsp
afd30525
JG
216 swapgs
217 sysretq
bc7b11c0 218SYM_CODE_END(entry_SYSCALL_64)
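Taken together, the opportunistic-SYSRET checks above (ignoring the Xen PV alternative, which always takes the IRET path) amount to the predicate below. This is a hedged C sketch: the struct, the SKETCH_* constants and can_use_sysret() are stand-ins, with the conventional x86-64 Linux selector and flag values assumed.

#include <stdbool.h>

/* Stand-ins for the pt_regs fields and constants the asm above inspects. */
struct sysret_regs {
	unsigned long ip, cs, flags, sp, ss;
	unsigned long rcx, r11;
};

#define SKETCH_USER_CS		0x33UL		/* __USER_CS               */
#define SKETCH_USER_DS		0x2bUL		/* __USER_DS, used as SS   */
#define SKETCH_EFLAGS_TF	0x100UL
#define SKETCH_EFLAGS_RF	0x10000UL

static bool can_use_sysret(const struct sysret_regs *r, unsigned int va_bits)
{
	long sext = (long)(r->rcx << (64 - va_bits)) >> (64 - va_bits);

	return r->rcx == r->ip &&		/* SYSRET reloads RIP from RCX    */
	       (unsigned long)sext == r->rcx &&	/* RIP must be canonical          */
	       r->cs == SKETCH_USER_CS &&
	       r->r11 == r->flags &&		/* SYSRET reloads RFLAGS from R11 */
	       !(r->r11 & (SKETCH_EFLAGS_RF | SKETCH_EFLAGS_TF)) &&
	       r->ss == SKETCH_USER_DS;
}
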
0bd7b798 219
0100301b
BG
220/*
221 * %rdi: prev task
222 * %rsi: next task
223 */
b9f6976b 224.pushsection .text, "ax"
96c64806 225SYM_FUNC_START(__switch_to_asm)
0100301b
BG
226 /*
227 * Save callee-saved registers
228 * This must match the order in inactive_task_frame
229 */
230 pushq %rbp
231 pushq %rbx
232 pushq %r12
233 pushq %r13
234 pushq %r14
235 pushq %r15
236
237 /* switch stack */
238 movq %rsp, TASK_threadsp(%rdi)
239 movq TASK_threadsp(%rsi), %rsp
240
050e9baa 241#ifdef CONFIG_STACKPROTECTOR
0100301b 242 movq TASK_stack_canary(%rsi), %rbx
e6401c13 243 movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
0100301b
BG
244#endif
245
c995efd5
DW
246#ifdef CONFIG_RETPOLINE
247 /*
248 * When switching from a shallower to a deeper call stack
249 * the RSB may either underflow or use entries populated
250 * with userspace addresses. On CPUs where those concerns
251 * exist, overwrite the RSB with entries which capture
252 * speculative execution to prevent attack.
253 */
d1c99108 254 FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
c995efd5
DW
255#endif
256
0100301b
BG
257 /* restore callee-saved registers */
258 popq %r15
259 popq %r14
260 popq %r13
261 popq %r12
262 popq %rbx
263 popq %rbp
264
265 jmp __switch_to
96c64806 266SYM_FUNC_END(__switch_to_asm)
b9f6976b 267.popsection
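For orientation, the frame that __switch_to_asm leaves on the outgoing task's stack can be pictured as the struct below. This is a sketch that mirrors the push order above (lowest address first); the authoritative layout is struct inactive_task_frame in asm/switch_to.h.

/*
 * Sketch only: rbp is pushed first and r15 last, so reading upward from the
 * saved RSP gives r15 .. rbp, followed by the return address pushed by the
 * call to __switch_to_asm.
 */
struct switch_frame_sketch {
	unsigned long r15;
	unsigned long r14;
	unsigned long r13;
	unsigned long r12;
	unsigned long rbx;
	unsigned long rbp;
	unsigned long ret_addr;		/* where the incoming task resumes */
};
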
0100301b 268
1eeb207f
DV
269/*
270 * A newly forked process directly context switches into this address.
271 *
0100301b 272 * rax: prev task we switched from
616d2483
BG
273 * rbx: kernel thread func (NULL for user thread)
274 * r12: kernel thread arg
1eeb207f 275 */
b9f6976b 276.pushsection .text, "ax"
bc7b11c0 277SYM_CODE_START(ret_from_fork)
8c1f7558 278 UNWIND_HINT_EMPTY
0100301b 279 movq %rax, %rdi
ebd57499 280 call schedule_tail /* rdi: 'prev' task parameter */
1eeb207f 281
ebd57499
JP
282 testq %rbx, %rbx /* from kernel_thread? */
283 jnz 1f /* kernel threads are uncommon */
24d978b7 284
616d2483 2852:
8c1f7558 286 UNWIND_HINT_REGS
ebd57499 287 movq %rsp, %rdi
167fd210 288 call syscall_exit_to_user_mode /* returns with IRQs disabled */
8a055d7f 289 jmp swapgs_restore_regs_and_return_to_usermode
616d2483
BG
290
2911:
292 /* kernel thread */
d31a5802 293 UNWIND_HINT_EMPTY
616d2483 294 movq %r12, %rdi
34fdce69 295 CALL_NOSPEC rbx
616d2483
BG
296 /*
297 * A kernel thread is allowed to return here after successfully
be619f7f 298 * calling kernel_execve(). Exit to userspace to complete the execve()
616d2483
BG
299 * syscall.
300 */
301 movq $0, RAX(%rsp)
302 jmp 2b
bc7b11c0 303SYM_CODE_END(ret_from_fork)
b9f6976b 304.popsection
1eeb207f 305
1d3e53e8
AL
306.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
307#ifdef CONFIG_DEBUG_ENTRY
e17f8234 308 pushq %rax
fafe5e74 309 SAVE_FLAGS
e17f8234 310 testl $X86_EFLAGS_IF, %eax
1d3e53e8
AL
311 jz .Lokay_\@
312 ud2
313.Lokay_\@:
e17f8234 314 popq %rax
1d3e53e8
AL
315#endif
316.endm
317
7f0f2134
LJ
318/* Save all registers in pt_regs */
319SYM_CODE_START_LOCAL(push_and_clear_regs)
320 UNWIND_HINT_FUNC
321 PUSH_AND_CLEAR_REGS save_ret=1
322 ENCODE_FRAME_POINTER 8
323 RET
324SYM_CODE_END(push_and_clear_regs)
325
cfa82a00
TG
326/**
327 * idtentry_body - Macro to emit code calling the C function
cfa82a00
TG
328 * @cfunc: C function to be called
329 * @has_error_code: Hardware pushed error code on stack
330 */
e2dcb5f1 331.macro idtentry_body cfunc has_error_code:req
cfa82a00 332
7f0f2134
LJ
333 call push_and_clear_regs
334 UNWIND_HINT_REGS
335
cfa82a00 336 call error_entry
90f93ae2
LJ
337 movq %rax, %rsp /* switch to the task stack if from userspace */
338 ENCODE_FRAME_POINTER
cfa82a00
TG
339 UNWIND_HINT_REGS
340
cfa82a00
TG
341 movq %rsp, %rdi /* pt_regs pointer into 1st argument*/
342
343 .if \has_error_code == 1
344 movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
345 movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
cfa82a00
TG
346 .endif
347
cfa82a00
TG
348 call \cfunc
349
424c7d0a 350 jmp error_return
cfa82a00
TG
351.endm
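The C handlers that idtentry_body dispatches to take one of two shapes, depending on has_error_code. The prototypes below are a sketch with hypothetical names (exc_example / exc_example_errorcode); the real ones are generated by the DEFINE_IDTENTRY* machinery in asm/idtentry.h.

struct pt_regs;		/* opaque here; the kernel's saved-register frame */

/* has_error_code=0: the handler receives only the pt_regs pointer (%rdi). */
void exc_example(struct pt_regs *regs);

/*
 * has_error_code=1: the hardware-pushed error code is passed in %rsi, and
 * ORIG_RAX is set to -1 so the frame is not mistaken for a syscall.
 */
void exc_example_errorcode(struct pt_regs *regs, unsigned long error_code);
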
352
353/**
354 * idtentry - Macro to generate entry stubs for simple IDT entries
355 * @vector: Vector number
356 * @asmsym: ASM symbol for the entry point
357 * @cfunc: C function to be called
358 * @has_error_code: Hardware pushed error code on stack
359 *
360 * The macro emits code to set up the kernel context for straight forward
361 * and simple IDT entries. No IST stack, no paranoid entry checks.
362 */
e2dcb5f1 363.macro idtentry vector asmsym cfunc has_error_code:req
cfa82a00
TG
364SYM_CODE_START(\asmsym)
365 UNWIND_HINT_IRET_REGS offset=\has_error_code*8
366 ASM_CLAC
367
368 .if \has_error_code == 0
369 pushq $-1 /* ORIG_RAX: no syscall to restart */
370 .endif
371
372 .if \vector == X86_TRAP_BP
373 /*
374 * If coming from kernel space, create a 6-word gap to allow the
375 * int3 handler to emulate a call instruction.
376 */
377 testb $3, CS-ORIG_RAX(%rsp)
378 jnz .Lfrom_usermode_no_gap_\@
379 .rept 6
380 pushq 5*8(%rsp)
381 .endr
382 UNWIND_HINT_IRET_REGS offset=8
383.Lfrom_usermode_no_gap_\@:
384 .endif
385
e2dcb5f1 386 idtentry_body \cfunc \has_error_code
cfa82a00
TG
387
388_ASM_NOKPROBE(\asmsym)
389SYM_CODE_END(\asmsym)
390.endm
391
0bf7c314
TG
392/*
393 * Interrupt entry/exit.
394 *
395 * The interrupt stubs push (vector) onto the stack, which is the error_code
396 * position of idtentry exceptions, and jump to one of the two idtentry points
397 * (common/spurious).
398 *
399 * common_interrupt is a hotpath, align it to a cache line
400 */
401.macro idtentry_irq vector cfunc
402 .p2align CONFIG_X86_L1_CACHE_SHIFT
403 idtentry \vector asm_\cfunc \cfunc has_error_code=1
404.endm
405
6368558c
TG
406/*
407 * System vectors which invoke their handlers directly and are not
408 * going through the regular common device interrupt handling code.
409 */
410.macro idtentry_sysvec vector cfunc
411 idtentry \vector asm_\cfunc \cfunc has_error_code=0
412.endm
413
cfa82a00
TG
414/**
415 * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
416 * @vector: Vector number
417 * @asmsym: ASM symbol for the entry point
418 * @cfunc: C function to be called
419 *
420 * The macro emits code to set up the kernel context for #MC and #DB
421 *
422 * If the entry comes from user space it uses the normal entry path
423 * including the return to user space work and preemption checks on
424 * exit.
425 *
426 * If it hits in kernel mode then it needs to go through the paranoid
427 * entry as the exception can hit any random state. No preemption
428 * check on exit to keep the paranoid path simple.
cfa82a00
TG
429 */
430.macro idtentry_mce_db vector asmsym cfunc
431SYM_CODE_START(\asmsym)
432 UNWIND_HINT_IRET_REGS
433 ASM_CLAC
434
435 pushq $-1 /* ORIG_RAX: no syscall to restart */
436
437 /*
438 * If the entry is from userspace, switch stacks and treat it as
439 * a normal entry.
440 */
441 testb $3, CS-ORIG_RAX(%rsp)
442 jnz .Lfrom_usermode_switch_stack_\@
443
c82965f9 444 /* paranoid_entry returns GS information for paranoid_exit in EBX. */
cfa82a00
TG
445 call paranoid_entry
446
447 UNWIND_HINT_REGS
448
cfa82a00 449 movq %rsp, %rdi /* pt_regs pointer */
cfa82a00 450
cfa82a00
TG
451 call \cfunc
452
cfa82a00
TG
453 jmp paranoid_exit
454
455 /* Switch to the regular task stack and use the noist entry point */
456.Lfrom_usermode_switch_stack_\@:
e2dcb5f1 457 idtentry_body noist_\cfunc, has_error_code=0
cfa82a00
TG
458
459_ASM_NOKPROBE(\asmsym)
460SYM_CODE_END(\asmsym)
461.endm
462
a13644f3
JR
463#ifdef CONFIG_AMD_MEM_ENCRYPT
464/**
465 * idtentry_vc - Macro to generate entry stub for #VC
466 * @vector: Vector number
467 * @asmsym: ASM symbol for the entry point
468 * @cfunc: C function to be called
469 *
470 * The macro emits code to set up the kernel context for #VC. The #VC handler
471 * runs on an IST stack and needs to be able to cause nested #VC exceptions.
472 *
473 * To make this work the #VC entry code tries its best to pretend it doesn't use
474 * an IST stack by switching to the task stack if coming from user-space (which
475 * includes early SYSCALL entry path) or back to the stack in the IRET frame if
476 * entered from kernel-mode.
477 *
478 * If entered from kernel-mode the return stack is validated first, and if it is
479 * not safe to use (e.g. because it points to the entry stack) the #VC handler
480 * will switch to a fall-back stack (VC2) and call a special handler function.
481 *
482 * The macro is only used for one vector, but it is planned to be extended in
483 * the future for the #HV exception.
484 */
485.macro idtentry_vc vector asmsym cfunc
486SYM_CODE_START(\asmsym)
487 UNWIND_HINT_IRET_REGS
488 ASM_CLAC
489
490 /*
491 * If the entry is from userspace, switch stacks and treat it as
492 * a normal entry.
493 */
494 testb $3, CS-ORIG_RAX(%rsp)
495 jnz .Lfrom_usermode_switch_stack_\@
496
497 /*
498 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
499 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
500 */
501 call paranoid_entry
502
503 UNWIND_HINT_REGS
504
505 /*
506 * Switch off the IST stack to make it free for nested exceptions. The
507 * vc_switch_off_ist() function will switch back to the interrupted
508 * stack if it is safe to do so. If not it switches to the VC fall-back
509 * stack.
510 */
511 movq %rsp, %rdi /* pt_regs pointer */
512 call vc_switch_off_ist
513 movq %rax, %rsp /* Switch to new stack */
514
515 UNWIND_HINT_REGS
516
517 /* Update pt_regs */
518 movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
519 movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
520
521 movq %rsp, %rdi /* pt_regs pointer */
522
be1a5408 523 call kernel_\cfunc
a13644f3
JR
524
525 /*
526 * No need to switch back to the IST stack. The current stack is either
527 * identical to the stack in the IRET frame or the VC fall-back stack,
163b0991 528 * so it is definitely mapped even with PTI enabled.
a13644f3
JR
529 */
530 jmp paranoid_exit
531
532 /* Switch to the regular task stack */
533.Lfrom_usermode_switch_stack_\@:
be1a5408 534 idtentry_body user_\cfunc, has_error_code=1
a13644f3
JR
535
536_ASM_NOKPROBE(\asmsym)
537SYM_CODE_END(\asmsym)
538.endm
539#endif
540
cfa82a00
TG
541/*
542 * Double fault entry. Straight paranoid. No checks from which context
543 * this comes because for the espfix induced #DF this would do the wrong
544 * thing.
545 */
546.macro idtentry_df vector asmsym cfunc
547SYM_CODE_START(\asmsym)
548 UNWIND_HINT_IRET_REGS offset=8
549 ASM_CLAC
550
c82965f9 551 /* paranoid_entry returns GS information for paranoid_exit in EBX. */
cfa82a00
TG
552 call paranoid_entry
553 UNWIND_HINT_REGS
554
cfa82a00
TG
555 movq %rsp, %rdi /* pt_regs pointer into first argument */
556 movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
557 movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
cfa82a00
TG
558 call \cfunc
559
560 jmp paranoid_exit
561
562_ASM_NOKPROBE(\asmsym)
563SYM_CODE_END(\asmsym)
564.endm
565
53aaf262
TG
566/*
567 * Include the defines which emit the idt entries which are shared
f0178fc0
TG
568 * between 32 and 64 bit and emit the __irqentry_text_* markers
569 * so the stacktrace boundary checks work.
53aaf262 570 */
f0178fc0
TG
571 .align 16
572 .globl __irqentry_text_start
573__irqentry_text_start:
574
53aaf262
TG
575#include <asm/idtentry.h>
576
f0178fc0
TG
577 .align 16
578 .globl __irqentry_text_end
579__irqentry_text_end:
580
fa5e5c40 581SYM_CODE_START_LOCAL(common_interrupt_return)
26ba4e57 582SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
26c4ef9c
AL
583#ifdef CONFIG_DEBUG_ENTRY
584 /* Assert that pt_regs indicates user mode. */
1e4c4f61 585 testb $3, CS(%rsp)
26c4ef9c
AL
586 jnz 1f
587 ud2
5881:
589#endif
98006843
LJ
590#ifdef CONFIG_XEN_PV
591 ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
592#endif
593
502af0d7 594 POP_REGS pop_rdi=0
3e3b9293
AL
595
596 /*
597 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
598 * Save old stack pointer and switch to trampoline stack.
599 */
600 movq %rsp, %rdi
c482feef 601 movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
1fb14363 602 UNWIND_HINT_EMPTY
3e3b9293
AL
603
604 /* Copy the IRET frame to the trampoline stack. */
605 pushq 6*8(%rdi) /* SS */
606 pushq 5*8(%rdi) /* RSP */
607 pushq 4*8(%rdi) /* EFLAGS */
608 pushq 3*8(%rdi) /* CS */
609 pushq 2*8(%rdi) /* RIP */
610
611 /* Push user RDI on the trampoline stack. */
612 pushq (%rdi)
613
614 /*
615 * We are on the trampoline stack. All regs except RDI are live.
616 * We can do future final exit work right here.
617 */
afaef01c 618 STACKLEAK_ERASE_NOCLOBBER
3e3b9293 619
6fd166aa 620 SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
8a09317b 621
3e3b9293
AL
622 /* Restore RDI. */
623 popq %rdi
624 SWAPGS
26c4ef9c
AL
625 INTERRUPT_RETURN
626
2601e64d 627
26ba4e57 628SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
26c4ef9c
AL
629#ifdef CONFIG_DEBUG_ENTRY
630 /* Assert that pt_regs indicates kernel mode. */
1e4c4f61 631 testb $3, CS(%rsp)
26c4ef9c
AL
632 jz 1f
633 ud2
6341:
635#endif
502af0d7 636 POP_REGS
e872045b 637 addq $8, %rsp /* skip regs->orig_ax */
10bcc80e
MD
638 /*
639 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
640 * when returning from IPI handler.
641 */
7209a75d
AL
642 INTERRUPT_RETURN
643
cc66936e 644SYM_INNER_LABEL_ALIGN(native_iret, SYM_L_GLOBAL)
8c1f7558 645 UNWIND_HINT_IRET_REGS
3891a04a
PA
646 /*
647 * Are we returning to a stack segment from the LDT? Note: in
648 * 64-bit mode SS:RSP on the exception stack is always valid.
649 */
34273f41 650#ifdef CONFIG_X86_ESPFIX64
4d732138
IM
651 testb $4, (SS-RIP)(%rsp)
652 jnz native_irq_return_ldt
34273f41 653#endif
3891a04a 654
cc66936e 655SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
b645af2d
AL
656 /*
657 * This may fault. Non-paranoid faults on return to userspace are
658 * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
c29c775a 659 * Double-faults due to espfix64 are handled in exc_double_fault.
b645af2d
AL
660 * Other faults here are fatal.
661 */
1da177e4 662 iretq
3701d863 663
34273f41 664#ifdef CONFIG_X86_ESPFIX64
7209a75d 665native_irq_return_ldt:
85063fac
AL
666 /*
667 * We are running with user GSBASE. All GPRs contain their user
668 * values. We have a percpu ESPFIX stack that is eight slots
669 * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom
670 * of the ESPFIX stack.
671 *
672 * We clobber RAX and RDI in this code. We stash RDI on the
673 * normal stack and RAX on the ESPFIX stack.
674 *
675 * The ESPFIX stack layout we set up looks like this:
676 *
677 * --- top of ESPFIX stack ---
678 * SS
679 * RSP
680 * RFLAGS
681 * CS
682 * RIP <-- RSP points here when we're done
683 * RAX <-- espfix_waddr points here
684 * --- bottom of ESPFIX stack ---
685 */
686
687 pushq %rdi /* Stash user RDI */
53c9d924 688 swapgs /* to kernel GS */
8a09317b
DH
689 SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
690
4d732138 691 movq PER_CPU_VAR(espfix_waddr), %rdi
85063fac
AL
692 movq %rax, (0*8)(%rdi) /* user RAX */
693 movq (1*8)(%rsp), %rax /* user RIP */
4d732138 694 movq %rax, (1*8)(%rdi)
85063fac 695 movq (2*8)(%rsp), %rax /* user CS */
4d732138 696 movq %rax, (2*8)(%rdi)
85063fac 697 movq (3*8)(%rsp), %rax /* user RFLAGS */
4d732138 698 movq %rax, (3*8)(%rdi)
85063fac 699 movq (5*8)(%rsp), %rax /* user SS */
4d732138 700 movq %rax, (5*8)(%rdi)
85063fac 701 movq (4*8)(%rsp), %rax /* user RSP */
4d732138 702 movq %rax, (4*8)(%rdi)
85063fac
AL
703 /* Now RAX == RSP. */
704
705 andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
85063fac
AL
706
707 /*
708 * espfix_stack[31:16] == 0. The page tables are set up such that
709 * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
710 * espfix_waddr for any X. That is, there are 65536 RO aliases of
711 * the same page. Set up RSP so that RSP[31:16] contains the
712 * respective 16 bits of the /userspace/ RSP and RSP nonetheless
713 * still points to an RO alias of the ESPFIX stack.
714 */
4d732138 715 orq PER_CPU_VAR(espfix_stack), %rax
8a09317b 716
6fd166aa 717 SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
53c9d924 718 swapgs /* to user GS */
8a09317b
DH
719 popq %rdi /* Restore user RDI */
720
4d732138 721 movq %rax, %rsp
8c1f7558 722 UNWIND_HINT_IRET_REGS offset=8
85063fac
AL
723
724 /*
725 * At this point, we cannot write to the stack any more, but we can
726 * still read.
727 */
728 popq %rax /* Restore user RAX */
729
730 /*
731 * RSP now points to an ordinary IRET frame, except that the page
732 * is read-only and RSP[31:16] are preloaded with the userspace
733 * values. We can now IRET back to userspace.
734 */
4d732138 735 jmp native_irq_return_iret
34273f41 736#endif
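As a C sketch of the final RSP the espfix64 path above constructs (espfix_iret_rsp() is an illustrative name): espfix_stack has bits 31:16 clear, the page tables alias the same read-only ESPFIX page for every value of those bits, and the user's own RSP[31:16] is merged in so nothing of the kernel stack leaks through a 16-bit SS IRET.

static unsigned long espfix_iret_rsp(unsigned long espfix_stack,
				     unsigned long user_rsp)
{
	/* Keep user RSP bits 31:16; espfix_stack supplies the rest. */
	return espfix_stack | (user_rsp & 0xffff0000UL);
}
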
fa5e5c40
TG
737SYM_CODE_END(common_interrupt_return)
738_ASM_NOKPROBE(common_interrupt_return)
3891a04a 739
b9f6976b
TG
740/*
741 * Reload gs selector with exception handling
742 * edi: new selector
743 *
744 * Is in entry.text as it shouldn't be instrumented.
745 */
410367e3 746SYM_FUNC_START(asm_load_gs_index)
8c1f7558 747 FRAME_BEGIN
c9317202 748 swapgs
42c748bb 749.Lgs_change:
4d732138 750 movl %edi, %gs
96e5d28a 7512: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
c9317202 752 swapgs
8c1f7558 753 FRAME_END
5a8cd547 754 RET
410367e3
TG
755SYM_FUNC_END(asm_load_gs_index)
756EXPORT_SYMBOL(asm_load_gs_index)
0bd7b798 757
98ededb6 758 _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
4d732138 759 .section .fixup, "ax"
1da177e4 760 /* running with kernelgs */
ef77e688 761SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
c9317202 762 swapgs /* switch back to user gs */
b038c842
AL
763.macro ZAP_GS
764 /* This can't be a string because the preprocessor needs to see it. */
765 movl $__USER_DS, %eax
766 movl %eax, %gs
767.endm
768 ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
4d732138
IM
769 xorl %eax, %eax
770 movl %eax, %gs
771 jmp 2b
ef77e688 772SYM_CODE_END(.Lbad_gs)
9f1e87ea 773 .previous
0bd7b798 774
28c11b0f 775#ifdef CONFIG_XEN_PV
3d75e1b8 776/*
9f1e87ea
CG
777 * A note on the "critical region" in our callback handler.
778 * We want to avoid stacking callback handlers due to events occurring
779 * during handling of the last event. To do this, we keep events disabled
780 * until we've done all processing. HOWEVER, we must enable events before
781 * popping the stack frame (can't be done atomically) and so it would still
782 * be possible to get enough handler activations to overflow the stack.
783 * Although unlikely, bugs of that kind are hard to track down, so we'd
784 * like to avoid the possibility.
785 * So, on entry to the handler we detect whether we interrupted an
786 * existing activation in its critical region -- if so, we pop the current
787 * activation and restart the handler using the previous one.
2f6474e4
TG
788 *
789 * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
9f1e87ea 790 */
2f6474e4 791SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
4d732138 792
9f1e87ea
CG
793/*
794 * Since we don't modify %rdi, xen_pv_evtchn_do_upcall(struct pt_regs *) will
795 * see the correct pointer to the pt_regs
796 */
8c1f7558 797 UNWIND_HINT_FUNC
4d732138 798 movq %rdi, %rsp /* we don't return, adjust the stack frame */
8c1f7558 799 UNWIND_HINT_REGS
1d3e53e8 800
2f6474e4 801 call xen_pv_evtchn_do_upcall
1d3e53e8 802
2f6474e4
TG
803 jmp error_return
804SYM_CODE_END(exc_xen_hypervisor_callback)
3d75e1b8
JF
805
806/*
9f1e87ea
CG
807 * Hypervisor uses this for application faults while it executes.
808 * We get here for two reasons:
809 * 1. Fault while reloading DS, ES, FS or GS
810 * 2. Fault while executing IRET
811 * Category 1 we do not need to fix up as Xen has already reloaded all segment
812 * registers that could be reloaded and zeroed the others.
813 * Category 2 we fix up by killing the current process. We cannot use the
814 * normal Linux return path in this case because if we use the IRET hypercall
815 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
816 * We distinguish between categories by comparing each saved segment register
817 * with its current contents: any discrepancy means we are in category 1.
818 */
bc7b11c0 819SYM_CODE_START(xen_failsafe_callback)
8c1f7558 820 UNWIND_HINT_EMPTY
4d732138
IM
821 movl %ds, %ecx
822 cmpw %cx, 0x10(%rsp)
823 jne 1f
824 movl %es, %ecx
825 cmpw %cx, 0x18(%rsp)
826 jne 1f
827 movl %fs, %ecx
828 cmpw %cx, 0x20(%rsp)
829 jne 1f
830 movl %gs, %ecx
831 cmpw %cx, 0x28(%rsp)
832 jne 1f
3d75e1b8 833 /* All segments match their saved values => Category 2 (Bad IRET). */
4d732138
IM
834 movq (%rsp), %rcx
835 movq 8(%rsp), %r11
836 addq $0x30, %rsp
837 pushq $0 /* RIP */
8c1f7558 838 UNWIND_HINT_IRET_REGS offset=8
be4c11af 839 jmp asm_exc_general_protection
3d75e1b8 8401: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
4d732138
IM
841 movq (%rsp), %rcx
842 movq 8(%rsp), %r11
843 addq $0x30, %rsp
8c1f7558 844 UNWIND_HINT_IRET_REGS
4d732138 845 pushq $-1 /* orig_ax = -1 => not a system call */
3f01daec 846 PUSH_AND_CLEAR_REGS
946c1911 847 ENCODE_FRAME_POINTER
e88d9741 848 jmp error_return
bc7b11c0 849SYM_CODE_END(xen_failsafe_callback)
28c11b0f 850#endif /* CONFIG_XEN_PV */
3d75e1b8 851
ebfc453e 852/*
c82965f9
CB
853 * Save all registers in pt_regs. Return GSBASE related information
854 * in EBX depending on the availability of the FSGSBASE instructions:
855 *
856 * FSGSBASE R/EBX
857 * N 0 -> SWAPGS on exit
858 * 1 -> no SWAPGS on exit
859 *
860 * Y GSBASE value at entry, must be restored in paranoid_exit
ebfc453e 861 */
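On CPUs without FSGSBASE, the convention in the table above boils down to a sign check on MSR_GS_BASE. A C sketch (paranoid_entry_ebx() is an illustrative helper; gs_base stands for the raw MSR value read on entry):

#include <stdbool.h>

/* Sketch: what paranoid_entry reports in EBX and whether it must SWAPGS. */
static int paranoid_entry_ebx(long gs_base, bool *do_swapgs)
{
	if (gs_base < 0) {		/* negative => kernel GSBASE already live */
		*do_swapgs = false;
		return 1;		/* EBX = 1: no SWAPGS in paranoid_exit */
	}
	*do_swapgs = true;		/* user GSBASE: SWAPGS on entry ...      */
	return 0;			/* ... and EBX = 0: SWAPGS again on exit */
}
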
ef1e0315 862SYM_CODE_START_LOCAL(paranoid_entry)
8c1f7558 863 UNWIND_HINT_FUNC
1eeb207f 864 cld
9e809d15
DB
865 PUSH_AND_CLEAR_REGS save_ret=1
866 ENCODE_FRAME_POINTER 8
8a09317b 867
16561f27
DH
868 /*
869 * Always stash CR3 in %r14. This value will be restored,
ae852495
AL
870 * verbatim, at exit. Needed if paranoid_entry interrupted
871 * another entry that already switched to the user CR3 value
872 * but has not yet returned to userspace.
16561f27
DH
873 *
874 * This is also why CS (stashed in the "iret frame" by the
875 * hardware at entry) can not be used: this may be a return
ae852495 876 * to kernel code, but with a user CR3 value.
96b23714
CB
877 *
878 * Switching CR3 does not depend on kernel GSBASE so it can
879 * be done before switching to the kernel GSBASE. This is
880 * required for FSGSBASE because the kernel GSBASE has to
881 * be retrieved from a kernel internal table.
16561f27 882 */
8a09317b
DH
883 SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
884
c82965f9
CB
885 /*
886 * Handling GSBASE depends on the availability of FSGSBASE.
887 *
888 * Without FSGSBASE the kernel enforces that negative GSBASE
889 * values indicate kernel GSBASE. With FSGSBASE no assumptions
890 * can be made about the GSBASE value when entering from user
891 * space.
892 */
893 ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
894
895 /*
896 * Read the current GSBASE and store it in %rbx unconditionally,
897 * retrieve and set the current CPUs kernel GSBASE. The stored value
898 * has to be restored in paranoid_exit unconditionally.
899 *
0b2c605f
BP
900 * The unconditional write to GS base below ensures that no subsequent
901 * loads based on a mispredicted GS base can happen, therefore no LFENCE
902 * is needed here.
c82965f9
CB
903 */
904 SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
5a8cd547 905 RET
c82965f9
CB
906
907.Lparanoid_entry_checkgs:
96b23714
CB
908 /* EBX = 1 -> kernel GSBASE active, no restore required */
909 movl $1, %ebx
488d5b3b 910
96b23714
CB
911 /*
912 * The kernel-enforced convention is a negative GSBASE indicates
913 * a kernel value. No SWAPGS needed on entry and exit.
914 */
915 movl $MSR_GS_BASE, %ecx
916 rdmsr
917 testl %edx, %edx
488d5b3b 918 js .Lparanoid_kernel_gsbase
96b23714 919
488d5b3b
LJ
920 /* EBX = 0 -> SWAPGS required on exit */
921 xorl %ebx, %ebx
53c9d924 922 swapgs
488d5b3b 923.Lparanoid_kernel_gsbase:
96b23714 924
18ec54fd 925 FENCE_SWAPGS_KERNEL_ENTRY
5a8cd547 926 RET
ef1e0315 927SYM_CODE_END(paranoid_entry)
ddeb8f21 928
ebfc453e
DV
929/*
930 * "Paranoid" exit path from exception stack. This is invoked
931 * only on return from non-NMI IST interrupts that came
932 * from kernel space.
933 *
934 * We may be returning to very strange contexts (e.g. very early
935 * in syscall entry), so checking for preemption here would
c82965f9
CB
936 * be complicated. Fortunately, there's no good reason to try
937 * to handle preemption here.
938 *
939 * R/EBX contains the GSBASE related information depending on the
940 * availability of the FSGSBASE instructions:
941 *
942 * FSGSBASE R/EBX
943 * N 0 -> SWAPGS on exit
944 * 1 -> no SWAPGS on exit
4d732138 945 *
c82965f9 946 * Y User space GSBASE, must be restored unconditionally
ebfc453e 947 */
ef1e0315 948SYM_CODE_START_LOCAL(paranoid_exit)
8c1f7558 949 UNWIND_HINT_REGS
c82965f9
CB
950 /*
951 * The order of operations is important. RESTORE_CR3 requires
952 * kernel GSBASE.
953 *
954 * NB to anyone to try to optimize this code: this code does
955 * not execute at all for exceptions from user mode. Those
956 * exceptions go through error_exit instead.
957 */
958 RESTORE_CR3 scratch_reg=%rax save_reg=%r14
959
960 /* Handle the three GSBASE cases */
961 ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
962
963 /* With FSGSBASE enabled, unconditionally restore GSBASE */
964 wrgsbase %rbx
965 jmp restore_regs_and_return_to_kernel
966
967.Lparanoid_exit_checkgs:
968 /* On non-FSGSBASE systems, conditionally do SWAPGS */
969 testl %ebx, %ebx
970 jnz restore_regs_and_return_to_kernel
971
972 /* We are returning to a context with user GSBASE */
53c9d924 973 swapgs
c82965f9 974 jmp restore_regs_and_return_to_kernel
ef1e0315 975SYM_CODE_END(paranoid_exit)
ddeb8f21
AH
976
977/*
7f0f2134 978 * Switch GS and CR3 if needed.
ddeb8f21 979 */
ef1e0315 980SYM_CODE_START_LOCAL(error_entry)
9e809d15 981 UNWIND_HINT_FUNC
ddeb8f21 982 cld
03335e95 983 testb $3, CS+8(%rsp)
cb6f64ed 984 jz .Lerror_kernelspace
539f5113 985
cb6f64ed
AL
986 /*
987 * We entered from user mode or we're pretending to have entered
988 * from user mode due to an IRET fault.
989 */
ddeb8f21 990 SWAPGS
18ec54fd 991 FENCE_SWAPGS_USER_ENTRY
8a09317b
DH
992 /* We have user CR3. Change to kernel CR3. */
993 SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
539f5113 994
90f93ae2 995 leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
cb6f64ed 996.Lerror_entry_from_usermode_after_swapgs:
7f2590a1 997 /* Put us onto the real thread stack. */
7f2590a1 998 call sync_regs
5a8cd547 999 RET
02bc7768 1000
ebfc453e
DV
1001 /*
1002 * There are two places in the kernel that can potentially fault with
1003 * usergs. Handle them here. B stepping K8s sometimes report a
1004 * truncated RIP for IRET exceptions returning to compat mode. Check
1005 * for these here too.
1006 */
cb6f64ed 1007.Lerror_kernelspace:
4d732138
IM
1008 leaq native_irq_return_iret(%rip), %rcx
1009 cmpq %rcx, RIP+8(%rsp)
cb6f64ed 1010 je .Lerror_bad_iret
4d732138
IM
1011 movl %ecx, %eax /* zero extend */
1012 cmpq %rax, RIP+8(%rsp)
cb6f64ed 1013 je .Lbstep_iret
42c748bb 1014 cmpq $.Lgs_change, RIP+8(%rsp)
18ec54fd 1015 jne .Lerror_entry_done_lfence
539f5113
AL
1016
1017 /*
42c748bb 1018 * hack: .Lgs_change can fail with user gsbase. If this happens, fix up
539f5113 1019 * gsbase and proceed. We'll fix up the exception and land in
42c748bb 1020 * .Lgs_change's error handler with kernel gsbase.
539f5113 1021 */
2fa5f04f 1022 SWAPGS
2347f1ad
LJ
1023
1024 /*
1025 * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
1026 * kernel or user gsbase.
1027 */
1028.Lerror_entry_done_lfence:
1029 FENCE_SWAPGS_KERNEL_ENTRY
90f93ae2 1030 leaq 8(%rsp), %rax /* return pt_regs pointer */
5a8cd547 1031 RET
ae24ffe5 1032
cb6f64ed 1033.Lbstep_iret:
ae24ffe5 1034 /* Fix truncated RIP */
4d732138 1035 movq %rcx, RIP+8(%rsp)
b645af2d
AL
1036 /* fall through */
1037
cb6f64ed 1038.Lerror_bad_iret:
539f5113 1039 /*
8a09317b
DH
1040 * We came from an IRET to user mode, so we have user
1041 * gsbase and CR3. Switch to kernel gsbase and CR3:
539f5113 1042 */
b645af2d 1043 SWAPGS
18ec54fd 1044 FENCE_SWAPGS_USER_ENTRY
8a09317b 1045 SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
539f5113
AL
1046
1047 /*
1048 * Pretend that the exception came from user mode: set up pt_regs
b3681dd5 1049 * as if we faulted immediately after IRET.
539f5113 1050 */
90f93ae2 1051 leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
4d732138 1052 call fixup_bad_iret
90f93ae2 1053 mov %rax, %rdi
cb6f64ed 1054 jmp .Lerror_entry_from_usermode_after_swapgs
ef1e0315 1055SYM_CODE_END(error_entry)
ddeb8f21 1056
424c7d0a
TG
1057SYM_CODE_START_LOCAL(error_return)
1058 UNWIND_HINT_REGS
1059 DEBUG_ENTRY_ASSERT_IRQS_OFF
1060 testb $3, CS(%rsp)
1061 jz restore_regs_and_return_to_kernel
1062 jmp swapgs_restore_regs_and_return_to_usermode
1063SYM_CODE_END(error_return)
1064
929bacec
AL
1065/*
1066 * Runs on exception stack. Xen PV does not go through this path at all,
1067 * so we can use real assembly here.
8a09317b
DH
1068 *
1069 * Registers:
1070 * %r14: Used to save/restore the CR3 of the interrupted context
1071 * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
929bacec 1072 */
6271fef0 1073SYM_CODE_START(asm_exc_nmi)
8c1f7558 1074 UNWIND_HINT_IRET_REGS
929bacec 1075
3f3c8b8c
SR
1076 /*
1077 * We allow breakpoints in NMIs. If a breakpoint occurs, then
1078 * the iretq it performs will take us out of NMI context.
1079 * This means that we can have nested NMIs where the next
1080 * NMI is using the top of the stack of the previous NMI. We
1081 * can't let it execute because the nested NMI will corrupt the
1082 * stack of the previous NMI. NMI handlers are not re-entrant
1083 * anyway.
1084 *
1085 * To handle this case we do the following:
1086 * Check a special location on the stack that contains
1087 * a variable that is set when NMIs are executing.
1088 * The interrupted task's stack is also checked to see if it
1089 * is an NMI stack.
1090 * If the variable is not set and the stack is not the NMI
1091 * stack then:
1092 * o Set the special variable on the stack
0b22930e
AL
1093 * o Copy the interrupt frame into an "outermost" location on the
1094 * stack
1095 * o Copy the interrupt frame into an "iret" location on the stack
3f3c8b8c
SR
1096 * o Continue processing the NMI
1097 * If the variable is set or the previous stack is the NMI stack:
0b22930e 1098 * o Modify the "iret" location to jump to the repeat_nmi
3f3c8b8c
SR
1099 * o return back to the first NMI
1100 *
1101 * Now on exit of the first NMI, we first clear the stack variable
1102 * The NMI stack will tell any nested NMIs at that point that it is
1103 * nested. Then we pop the stack normally with iret, and if there was
1104 * a nested NMI that updated the copy interrupt stack frame, a
1105 * jump will be made to the repeat_nmi code that will handle the second
1106 * NMI.
9b6e6a83
AL
1107 *
1108 * However, espfix prevents us from directly returning to userspace
1109 * with a single IRET instruction. Similarly, IRET to user mode
1110 * can fault. We therefore handle NMIs from user space like
1111 * other IST entries.
3f3c8b8c
SR
1112 */
1113
e93c1730
AL
1114 ASM_CLAC
1115
146b2b09 1116 /* Use %rdx as our temp variable throughout */
4d732138 1117 pushq %rdx
3f3c8b8c 1118
9b6e6a83
AL
1119 testb $3, CS-RIP+8(%rsp)
1120 jz .Lnmi_from_kernel
1121
1122 /*
1123 * NMI from user mode. We need to run on the thread stack, but we
1124 * can't go through the normal entry paths: NMIs are masked, and
1125 * we don't want to enable interrupts, because then we'll end
1126 * up in an awkward situation in which IRQs are on but NMIs
1127 * are off.
83c133cf
AL
1128 *
1129 * We also must not push anything to the stack before switching
1130 * stacks lest we corrupt the "NMI executing" variable.
9b6e6a83
AL
1131 */
1132
929bacec 1133 swapgs
9b6e6a83 1134 cld
18ec54fd 1135 FENCE_SWAPGS_USER_ENTRY
8a09317b 1136 SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
9b6e6a83
AL
1137 movq %rsp, %rdx
1138 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
8c1f7558 1139 UNWIND_HINT_IRET_REGS base=%rdx offset=8
9b6e6a83
AL
1140 pushq 5*8(%rdx) /* pt_regs->ss */
1141 pushq 4*8(%rdx) /* pt_regs->rsp */
1142 pushq 3*8(%rdx) /* pt_regs->flags */
1143 pushq 2*8(%rdx) /* pt_regs->cs */
1144 pushq 1*8(%rdx) /* pt_regs->rip */
8c1f7558 1145 UNWIND_HINT_IRET_REGS
9b6e6a83 1146 pushq $-1 /* pt_regs->orig_ax */
30907fd1 1147 PUSH_AND_CLEAR_REGS rdx=(%rdx)
946c1911 1148 ENCODE_FRAME_POINTER
9b6e6a83
AL
1149
1150 /*
1151 * At this point we no longer need to worry about stack damage
1152 * due to nesting -- we're on the normal thread stack and we're
1153 * done with the NMI stack.
1154 */
1155
1156 movq %rsp, %rdi
1157 movq $-1, %rsi
6271fef0 1158 call exc_nmi
9b6e6a83 1159
45d5a168 1160 /*
9b6e6a83 1161 * Return back to user mode. We must *not* do the normal exit
946c1911 1162 * work, because we don't want to enable interrupts.
45d5a168 1163 */
8a055d7f 1164 jmp swapgs_restore_regs_and_return_to_usermode
45d5a168 1165
9b6e6a83 1166.Lnmi_from_kernel:
3f3c8b8c 1167 /*
0b22930e
AL
1168 * Here's what our stack frame will look like:
1169 * +---------------------------------------------------------+
1170 * | original SS |
1171 * | original Return RSP |
1172 * | original RFLAGS |
1173 * | original CS |
1174 * | original RIP |
1175 * +---------------------------------------------------------+
1176 * | temp storage for rdx |
1177 * +---------------------------------------------------------+
1178 * | "NMI executing" variable |
1179 * +---------------------------------------------------------+
1180 * | iret SS } Copied from "outermost" frame |
1181 * | iret Return RSP } on each loop iteration; overwritten |
1182 * | iret RFLAGS } by a nested NMI to force another |
1183 * | iret CS } iteration if needed. |
1184 * | iret RIP } |
1185 * +---------------------------------------------------------+
1186 * | outermost SS } initialized in first_nmi; |
1187 * | outermost Return RSP } will not be changed before |
1188 * | outermost RFLAGS } NMI processing is done. |
1189 * | outermost CS } Copied to "iret" frame on each |
1190 * | outermost RIP } iteration. |
1191 * +---------------------------------------------------------+
1192 * | pt_regs |
1193 * +---------------------------------------------------------+
1194 *
1195 * The "original" frame is used by hardware. Before re-enabling
1196 * NMIs, we need to be done with it, and we need to leave enough
1197 * space for the asm code here.
1198 *
1199 * We return by executing IRET while RSP points to the "iret" frame.
1200 * That will either return for real or it will loop back into NMI
1201 * processing.
1202 *
1203 * The "outermost" frame is copied to the "iret" frame on each
1204 * iteration of the loop, so each iteration starts with the "iret"
1205 * frame pointing to the final return target.
1206 */
1207
45d5a168 1208 /*
0b22930e
AL
1209 * Determine whether we're a nested NMI.
1210 *
a27507ca
AL
1211 * If we interrupted kernel code between repeat_nmi and
1212 * end_repeat_nmi, then we are a nested NMI. We must not
1213 * modify the "iret" frame because it's being written by
1214 * the outer NMI. That's okay; the outer NMI handler is
6271fef0 1215 * about to call exc_nmi() anyway, so we can just
a27507ca 1216 * resume the outer NMI.
45d5a168 1217 */
a27507ca
AL
1218
1219 movq $repeat_nmi, %rdx
1220 cmpq 8(%rsp), %rdx
1221 ja 1f
1222 movq $end_repeat_nmi, %rdx
1223 cmpq 8(%rsp), %rdx
1224 ja nested_nmi_out
12251:
45d5a168 1226
3f3c8b8c 1227 /*
a27507ca 1228 * Now check "NMI executing". If it's set, then we're nested.
0b22930e
AL
1229 * This will not detect if we interrupted an outer NMI just
1230 * before IRET.
3f3c8b8c 1231 */
4d732138
IM
1232 cmpl $1, -8(%rsp)
1233 je nested_nmi
3f3c8b8c
SR
1234
1235 /*
0b22930e
AL
1236 * Now test if the previous stack was an NMI stack. This covers
1237 * the case where we interrupt an outer NMI after it clears
810bc075
AL
1238 * "NMI executing" but before IRET. We need to be careful, though:
1239 * there is one case in which RSP could point to the NMI stack
1240 * despite there being no NMI active: naughty userspace controls
1241 * RSP at the very beginning of the SYSCALL targets. We can
1242 * pull a fast one on naughty userspace, though: we program
1243 * SYSCALL to mask DF, so userspace cannot cause DF to be set
1244 * if it controls the kernel's RSP. We set DF before we clear
1245 * "NMI executing".
3f3c8b8c 1246 */
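The checks that follow implement, in effect, the classification below. This is a C sketch with illustrative names: repeat_lo/repeat_hi stand for repeat_nmi/end_repeat_nmi, stack_bottom/stack_top for the NMI IST stack bounds, and nmi_executing for the on-stack variable described above.

#include <stdbool.h>

enum nmi_kind { NMI_FIRST, NMI_NESTED, NMI_NESTED_IN_REPEAT };

/* Sketch of the nested-NMI decision made below; X86_EFLAGS_DF is bit 10. */
static enum nmi_kind classify_nmi(unsigned long rip, unsigned long prev_rsp,
				  unsigned long rflags, bool nmi_executing,
				  unsigned long repeat_lo, unsigned long repeat_hi,
				  unsigned long stack_bottom, unsigned long stack_top)
{
	if (rip >= repeat_lo && rip < repeat_hi)
		return NMI_NESTED_IN_REPEAT;	/* outer NMI is about to repeat */
	if (nmi_executing)
		return NMI_NESTED;
	if (prev_rsp >= stack_bottom && prev_rsp <= stack_top &&
	    (rflags & (1UL << 10)))		/* DF set => RSP not user-forged */
		return NMI_NESTED;
	return NMI_FIRST;
}
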
0784b364
DV
1247 lea 6*8(%rsp), %rdx
1248 /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
1249 cmpq %rdx, 4*8(%rsp)
1250 /* If the stack pointer is above the NMI stack, this is a normal NMI */
1251 ja first_nmi
4d732138 1252
0784b364
DV
1253 subq $EXCEPTION_STKSZ, %rdx
1254 cmpq %rdx, 4*8(%rsp)
1255 /* If it is below the NMI stack, it is a normal NMI */
1256 jb first_nmi
810bc075
AL
1257
1258 /* Ah, it is within the NMI stack. */
1259
1260 testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
1261 jz first_nmi /* RSP was user controlled. */
1262
1263 /* This is a nested NMI. */
0784b364 1264
3f3c8b8c
SR
1265nested_nmi:
1266 /*
0b22930e
AL
1267 * Modify the "iret" frame to point to repeat_nmi, forcing another
1268 * iteration of NMI handling.
3f3c8b8c 1269 */
23a781e9 1270 subq $8, %rsp
4d732138
IM
1271 leaq -10*8(%rsp), %rdx
1272 pushq $__KERNEL_DS
1273 pushq %rdx
131484c8 1274 pushfq
4d732138
IM
1275 pushq $__KERNEL_CS
1276 pushq $repeat_nmi
3f3c8b8c
SR
1277
1278 /* Put stack back */
4d732138 1279 addq $(6*8), %rsp
3f3c8b8c
SR
1280
1281nested_nmi_out:
4d732138 1282 popq %rdx
3f3c8b8c 1283
0b22930e 1284 /* We are returning to kernel mode, so this cannot result in a fault. */
929bacec 1285 iretq
3f3c8b8c
SR
1286
1287first_nmi:
0b22930e 1288 /* Restore rdx. */
4d732138 1289 movq (%rsp), %rdx
62610913 1290
36f1a77b
AL
1291 /* Make room for "NMI executing". */
1292 pushq $0
3f3c8b8c 1293
0b22930e 1294 /* Leave room for the "iret" frame */
4d732138 1295 subq $(5*8), %rsp
28696f43 1296
0b22930e 1297 /* Copy the "original" frame to the "outermost" frame */
3f3c8b8c 1298 .rept 5
4d732138 1299 pushq 11*8(%rsp)
3f3c8b8c 1300 .endr
8c1f7558 1301 UNWIND_HINT_IRET_REGS
62610913 1302
79fb4ad6
SR
1303 /* Everything up to here is safe from nested NMIs */
1304
a97439aa
AL
1305#ifdef CONFIG_DEBUG_ENTRY
1306 /*
1307 * For ease of testing, unmask NMIs right away. Disabled by
1308 * default because IRET is very expensive.
1309 */
1310 pushq $0 /* SS */
1311 pushq %rsp /* RSP (minus 8 because of the previous push) */
1312 addq $8, (%rsp) /* Fix up RSP */
1313 pushfq /* RFLAGS */
1314 pushq $__KERNEL_CS /* CS */
1315 pushq $1f /* RIP */
929bacec 1316 iretq /* continues at repeat_nmi below */
8c1f7558 1317 UNWIND_HINT_IRET_REGS
a97439aa
AL
13181:
1319#endif
1320
0b22930e 1321repeat_nmi:
62610913
JB
1322 /*
1323 * If there was a nested NMI, the first NMI's iret will return
1324 * here. But NMIs are still enabled and we can take another
1325 * nested NMI. The nested NMI checks the interrupted RIP to see
1326 * if it is between repeat_nmi and end_repeat_nmi, and if so
1327 * it will just return, as we are about to repeat an NMI anyway.
1328 * This makes it safe to copy to the stack frame that a nested
1329 * NMI will update.
0b22930e
AL
1330 *
1331 * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
1332 * we're repeating an NMI, gsbase has the same value that it had on
1333 * the first iteration. paranoid_entry will load the kernel
6271fef0 1334 * gsbase if needed before we call exc_nmi(). "NMI executing"
36f1a77b 1335 * is zero.
62610913 1336 */
36f1a77b 1337 movq $1, 10*8(%rsp) /* Set "NMI executing". */
3f3c8b8c 1338
62610913 1339 /*
0b22930e
AL
1340 * Copy the "outermost" frame to the "iret" frame. NMIs that nest
1341 * here must not modify the "iret" frame while we're writing to
1342 * it or it will end up containing garbage.
62610913 1343 */
4d732138 1344 addq $(10*8), %rsp
3f3c8b8c 1345 .rept 5
4d732138 1346 pushq -6*8(%rsp)
3f3c8b8c 1347 .endr
4d732138 1348 subq $(5*8), %rsp
62610913 1349end_repeat_nmi:
3f3c8b8c
SR
1350
1351 /*
0b22930e
AL
1352 * Everything below this point can be preempted by a nested NMI.
1353 * If this happens, then the inner NMI will change the "iret"
1354 * frame to point back to repeat_nmi.
3f3c8b8c 1355 */
4d732138 1356 pushq $-1 /* ORIG_RAX: no syscall to restart */
76f5df43 1357
1fd466ef 1358 /*
ebfc453e 1359 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
1fd466ef
SR
1360 * as we should not be calling schedule in NMI context,
1361 * even with normal interrupts enabled. An NMI should not be
1362 * setting NEED_RESCHED or anything that normal interrupts and
1363 * exceptions might do.
1364 */
4d732138 1365 call paranoid_entry
8c1f7558 1366 UNWIND_HINT_REGS
7fbb98c5 1367
4d732138
IM
1368 movq %rsp, %rdi
1369 movq $-1, %rsi
6271fef0 1370 call exc_nmi
7fbb98c5 1371
16561f27 1372 /* Always restore stashed CR3 value (see paranoid_entry) */
21e94459 1373 RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
8a09317b 1374
c82965f9
CB
1375 /*
1376 * The above invocation of paranoid_entry stored the GSBASE
1377 * related information in R/EBX depending on the availability
1378 * of FSGSBASE.
1379 *
1380 * If FSGSBASE is enabled, restore the saved GSBASE value
1381 * unconditionally, otherwise take the conditional SWAPGS path.
1382 */
1383 ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
1384
1385 wrgsbase %rbx
1386 jmp nmi_restore
1387
1388nmi_no_fsgsbase:
1389 /* EBX == 0 -> invoke SWAPGS */
1390 testl %ebx, %ebx
4d732138 1391 jnz nmi_restore
c82965f9 1392
ddeb8f21 1393nmi_swapgs:
53c9d924 1394 swapgs
c82965f9 1395
ddeb8f21 1396nmi_restore:
502af0d7 1397 POP_REGS
0b22930e 1398
471ee483
AL
1399 /*
1400 * Skip orig_ax and the "outermost" frame so that RSP points at the
1401 * "iret" frame.
1402 */
1403 addq $6*8, %rsp
28696f43 1404
810bc075
AL
1405 /*
1406 * Clear "NMI executing". Set DF first so that we can easily
1407 * distinguish the remaining code between here and IRET from
929bacec
AL
1408 * the SYSCALL entry and exit paths.
1409 *
1410 * We arguably should just inspect RIP instead, but I (Andy) wrote
1411 * this code when I had the misapprehension that Xen PV supported
1412 * NMIs, and Xen PV would break that approach.
810bc075
AL
1413 */
1414 std
1415 movq $0, 5*8(%rsp) /* clear "NMI executing" */
0b22930e
AL
1416
1417 /*
929bacec
AL
1418 * iretq reads the "iret" frame and exits the NMI stack in a
1419 * single instruction. We are returning to kernel mode, so this
1420 * cannot result in a fault. Similarly, we don't need to worry
1421 * about espfix64 on the way back to kernel mode.
0b22930e 1422 */
929bacec 1423 iretq
6271fef0 1424SYM_CODE_END(asm_exc_nmi)
ddeb8f21 1425
dffb3f9d
AL
1426#ifndef CONFIG_IA32_EMULATION
1427/*
1428 * This handles SYSCALL from 32-bit code. There is no way to program
1429 * MSRs to fully disable 32-bit SYSCALL.
1430 */
bc7b11c0 1431SYM_CODE_START(ignore_sysret)
8c1f7558 1432 UNWIND_HINT_EMPTY
4d732138 1433 mov $-ENOSYS, %eax
b2b1d94c 1434 sysretl
bc7b11c0 1435SYM_CODE_END(ignore_sysret)
dffb3f9d 1436#endif
2deb4be2 1437
b9f6976b 1438.pushsection .text, "ax"
bc7b11c0 1439SYM_CODE_START(rewind_stack_do_exit)
8c1f7558 1440 UNWIND_HINT_FUNC
2deb4be2
AL
1441 /* Prevent any naive code from trying to unwind to our caller. */
1442 xorl %ebp, %ebp
1443
1444 movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
8c1f7558 1445 leaq -PTREGS_SIZE(%rax), %rsp
f977df7b 1446 UNWIND_HINT_REGS
2deb4be2
AL
1447
1448 call do_exit
bc7b11c0 1449SYM_CODE_END(rewind_stack_do_exit)
b9f6976b 1450.popsection