/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  linux/arch/x86_64/entry.S
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.rst
 *
 * A note on terminology:
 * - iret frame:	Architecture defined interrupt frame from SS to RIP
 *			at the top of the kernel process stack.
 *
 * Some macro usage:
 * - SYM_FUNC_START/END: Define functions in the symbol table.
 * - TRACE_IRQ_*:	 Trace hardirq state for lock debugging.
 * - idtentry:		 Define exception entry points.
 */
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/frame.h>
#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
#include <linux/err.h>

#include "calling.h"

.code64
.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT
SYM_CODE_START(native_usergs_sysret64)
	UNWIND_HINT_EMPTY
	swapgs
	sysretq
SYM_CODE_END(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */

.macro TRACE_IRQS_FLAGS flags:req
#ifdef CONFIG_TRACE_IRQFLAGS
	btl	$9, \flags		/* interrupts off? */
	jnc	1f
	TRACE_IRQS_ON
1:
#endif
.endm

.macro TRACE_IRQS_IRETQ
	TRACE_IRQS_FLAGS EFLAGS(%rsp)
.endm

/*
 * When dynamic function tracer is enabled it will add a breakpoint
 * to all locations that it is about to modify, sync CPUs, update
 * all the code, sync CPUs, then remove the breakpoints. In this time
 * if lockdep is enabled, it might jump back into the debug handler
 * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
 *
 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
 * make sure the stack pointer does not get reset back to the top
 * of the debug stack, and instead just reuses the current stack.
 */
#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)

.macro TRACE_IRQS_OFF_DEBUG
	call	debug_stack_set_zero
	TRACE_IRQS_OFF
	call	debug_stack_reset
.endm

.macro TRACE_IRQS_ON_DEBUG
	call	debug_stack_set_zero
	TRACE_IRQS_ON
	call	debug_stack_reset
.endm

.macro TRACE_IRQS_IRETQ_DEBUG
	btl	$9, EFLAGS(%rsp)		/* interrupts off? */
	jnc	1f
	TRACE_IRQS_ON_DEBUG
1:
.endm

#else
# define TRACE_IRQS_OFF_DEBUG			TRACE_IRQS_OFF
# define TRACE_IRQS_ON_DEBUG			TRACE_IRQS_ON
# define TRACE_IRQS_IRETQ_DEBUG			TRACE_IRQS_IRETQ
#endif

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls.  The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries.  There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When user can change pt_regs->foo always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

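/*
 * For reference only (not part of the entry code): a raw user-space caller
 * following the convention above might look roughly like this; "buf" and
 * "len" are hypothetical symbols used purely for illustration:
 *
 *	movq	$__NR_write, %rax	# system call number
 *	movq	$1, %rdi		# arg0: fd
 *	leaq	buf(%rip), %rsi		# arg1: buffer
 *	movq	$len, %rdx		# arg2: count
 *	syscall				# rcx/r11 are clobbered by the CPU
 *
 * The return value comes back in %rax; rcx and r11 cannot be relied upon
 * across the instruction, exactly as documented above.
 */
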
SYM_CODE_START(entry_SYSCALL_64)
	UNWIND_HINT_EMPTY
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */

	swapgs
	/* tss.sp2 is scratch space. */
	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS				/* pt_regs->ss */
	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
	pushq	%r11					/* pt_regs->flags */
	pushq	$__USER_CS				/* pt_regs->cs */
	pushq	%rcx					/* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
	pushq	%rax					/* pt_regs->orig_ax */

	PUSH_AND_CLEAR_REGS rax=$-ENOSYS

	/* IRQs are off. */
	movq	%rax, %rdi
	movq	%rsp, %rsi
	call	do_syscall_64		/* returns with IRQs disabled */

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.  If we're not,
	 * go to the slow exit path.
	 */
	movq	RCX(%rsp), %rcx
	movq	RIP(%rsp), %r11

	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
	jne	swapgs_restore_regs_and_return_to_usermode

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space.  This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * If width of "canonical tail" ever becomes variable, this will need
	 * to be updated to remain correct on both old and new CPUs.
	 *
	 * Change top bits to match most significant bit (47th or 56th bit
	 * depending on paging mode) in the address.
	 */
#ifdef CONFIG_X86_5LEVEL
	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
		"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
#else
	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
#endif
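
	/*
	 * Worked example (purely illustrative) for the 4-level case above:
	 * with 48-bit virtual addresses, "shl $16; sar $16" sign-extends
	 * bit 47 into bits 63:48.  A canonical RCX therefore comes back
	 * unchanged, while a non-canonical one is modified and gets caught
	 * by the compare that follows.
	 */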

	/* If this changed %rcx, it was not canonical */
	cmpq	%rcx, %r11
	jne	swapgs_restore_regs_and_return_to_usermode

	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
	jne	swapgs_restore_regs_and_return_to_usermode

	movq	R11(%rsp), %r11
	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
	jne	swapgs_restore_regs_and_return_to_usermode

	/*
	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
	 * restore RF properly. If the slowpath sets it for whatever reason, we
	 * need to restore it correctly.
	 *
	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
	 * trap from userspace immediately after SYSRET.  This would cause an
	 * infinite loop whenever #DB happens with register state that satisfies
	 * the opportunistic SYSRET conditions.  For example, single-stepping
	 * this user code:
	 *
	 *           movq	$stuck_here, %rcx
	 *           pushfq
	 *           popq %r11
	 *   stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz	swapgs_restore_regs_and_return_to_usermode

	/* nothing to check for RSP */

	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
	jne	swapgs_restore_regs_and_return_to_usermode

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	POP_REGS pop_rdi=0 skip_r11rcx=1

	/*
	 * Now all regs are restored except RSP and RDI.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_EMPTY

	pushq	RSP-RDI(%rdi)	/* RSP */
	pushq	(%rdi)		/* RDI */

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	popq	%rdi
	popq	%rsp
	USERGS_SYSRET64
SYM_CODE_END(entry_SYSCALL_64)

/*
 * %rdi: prev task
 * %rsi: next task
 */
.pushsection .text, "ax"
SYM_FUNC_START(__switch_to_asm)
	/*
	 * Save callee-saved registers
	 * This must match the order in inactive_task_frame
	 */
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* switch stack */
	movq	%rsp, TASK_threadsp(%rdi)
	movq	TASK_threadsp(%rsi), %rsp

#ifdef CONFIG_STACKPROTECTOR
	movq	TASK_stack_canary(%rsi), %rbx
	movq	%rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
#endif

#ifdef CONFIG_RETPOLINE
	/*
	 * When switching from a shallower to a deeper call stack
	 * the RSB may either underflow or use entries populated
	 * with userspace addresses. On CPUs where those concerns
	 * exist, overwrite the RSB with entries which capture
	 * speculative execution to prevent attack.
	 */
	FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
#endif

	/* restore callee-saved registers */
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp

	jmp	__switch_to
SYM_FUNC_END(__switch_to_asm)
.popsection

/*
 * A newly forked process directly context switches into this address.
 *
 * rax: prev task we switched from
 * rbx: kernel thread func (NULL for user thread)
 * r12: kernel thread arg
 */
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork)
	UNWIND_HINT_EMPTY
	movq	%rax, %rdi
	call	schedule_tail			/* rdi: 'prev' task parameter */

	testq	%rbx, %rbx			/* from kernel_thread? */
	jnz	1f				/* kernel threads are uncommon */

2:
	UNWIND_HINT_REGS
	movq	%rsp, %rdi
	call	syscall_return_slowpath		/* returns with IRQs disabled */
	jmp	swapgs_restore_regs_and_return_to_usermode

1:
	/* kernel thread */
	UNWIND_HINT_EMPTY
	movq	%r12, %rdi
	CALL_NOSPEC rbx
	/*
	 * A kernel thread is allowed to return here after successfully
	 * calling do_execve().  Exit to userspace to complete the execve()
	 * syscall.
	 */
	movq	$0, RAX(%rsp)
	jmp	2b
SYM_CODE_END(ret_from_fork)
.popsection

.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
	pushq %rax
	SAVE_FLAGS(CLBR_RAX)
	testl $X86_EFLAGS_IF, %eax
	jz .Lokay_\@
	ud2
.Lokay_\@:
	popq %rax
#endif
.endm

/*
 * Enters the IRQ stack if we're not already using it.  NMI-safe.  Clobbers
 * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
 * Requires kernel GSBASE.
 *
 * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
 */
.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0
	DEBUG_ENTRY_ASSERT_IRQS_OFF

	.if \save_ret
	/*
	 * If save_ret is set, the original stack contains one additional
	 * entry -- the return address. Therefore, move the address one
	 * entry below %rsp to \old_rsp.
	 */
	leaq	8(%rsp), \old_rsp
	.else
	movq	%rsp, \old_rsp
	.endif

	.if \regs
	UNWIND_HINT_REGS base=\old_rsp
	.endif

	incl	PER_CPU_VAR(irq_count)
	jnz	.Lirq_stack_push_old_rsp_\@

	/*
	 * Right now, if we just incremented irq_count to zero, we've
	 * claimed the IRQ stack but we haven't switched to it yet.
	 *
	 * If anything is added that can interrupt us here without using IST,
	 * it must be *extremely* careful to limit its stack usage.  This
	 * could include kprobes and a hypothetical future IST-less #DB
	 * handler.
	 *
	 * The OOPS unwinder relies on the word at the top of the IRQ
	 * stack linking back to the previous RSP for the entire time we're
	 * on the IRQ stack.  For this to work reliably, we need to write
	 * it before we actually move ourselves to the IRQ stack.
	 */

	movq	\old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8)
	movq	PER_CPU_VAR(hardirq_stack_ptr), %rsp

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * If the first movq above becomes wrong due to IRQ stack layout
	 * changes, the only way we'll notice is if we try to unwind right
	 * here.  Assert that we set up the stack right to catch this type
	 * of bug quickly.
	 */
	cmpq	-8(%rsp), \old_rsp
	je	.Lirq_stack_okay\@
	ud2
	.Lirq_stack_okay\@:
#endif

.Lirq_stack_push_old_rsp_\@:
	pushq	\old_rsp

	.if \regs
	UNWIND_HINT_REGS indirect=1
	.endif

	.if \save_ret
	/*
	 * Push the return address to the stack. This return address can
	 * be found at the "real" original RSP, which was offset by 8 at
	 * the beginning of this macro.
	 */
	pushq	-8(\old_rsp)
	.endif
.endm

/*
 * Undoes ENTER_IRQ_STACK.
 */
.macro LEAVE_IRQ_STACK regs=1
	DEBUG_ENTRY_ASSERT_IRQS_OFF
	/* We need to be off the IRQ stack before decrementing irq_count. */
	popq	%rsp

	.if \regs
	UNWIND_HINT_REGS
	.endif

	/*
	 * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
	 * the irq stack but we're not on it.
	 */

	decl	PER_CPU_VAR(irq_count)
.endm

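/*
 * Illustrative pairing of the two macros above (sketch only; the handler
 * name is hypothetical and not defined in this file):
 *
 *	ENTER_IRQ_STACK regs=0 old_rsp=%r10
 *	call	my_irq_handler
 *	LEAVE_IRQ_STACK regs=0
 */
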
/**
 * idtentry_body - Macro to emit code calling the C function
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 */
.macro idtentry_body cfunc has_error_code:req

	call	error_entry
	UNWIND_HINT_REGS

	movq	%rsp, %rdi			/* pt_regs pointer into 1st argument*/

	.if \has_error_code == 1
		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
	.endif

	call	\cfunc

	jmp	error_return
.endm

/**
 * idtentry - Macro to generate entry stubs for simple IDT entries
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 *
 * The macro emits code to set up the kernel context for straight forward
 * and simple IDT entries. No IST stack, no paranoid entry checks.
 */
.macro idtentry vector asmsym cfunc has_error_code:req
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
	ASM_CLAC

	.if \has_error_code == 0
		pushq	$-1			/* ORIG_RAX: no syscall to restart */
	.endif

	.if \vector == X86_TRAP_BP
		/*
		 * If coming from kernel space, create a 6-word gap to allow the
		 * int3 handler to emulate a call instruction.
		 */
		testb	$3, CS-ORIG_RAX(%rsp)
		jnz	.Lfrom_usermode_no_gap_\@
		.rept	6
		pushq	5*8(%rsp)
		.endr
		UNWIND_HINT_IRET_REGS offset=8
.Lfrom_usermode_no_gap_\@:
	.endif

	idtentry_body \cfunc \has_error_code

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

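/*
 * For orientation only: the actual instantiations of these macros come
 * from asm/idtentry.h (included further below).  When this file is
 * assembled, a declaration such as DECLARE_IDTENTRY(X86_TRAP_DE,
 * exc_divide_error) expands to roughly:
 *
 *	idtentry X86_TRAP_DE asm_exc_divide_error exc_divide_error has_error_code=0
 *
 * i.e. an asm_* stub that reaches the C handler through idtentry_body.
 */
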
/*
 * Interrupt entry/exit.
 *
 * The interrupt stubs push (vector) onto the stack, which is the error_code
 * position of idtentry exceptions, and jump to one of the two idtentry points
 * (common/spurious).
 *
 * common_interrupt is a hotpath, align it to a cache line
 */
.macro idtentry_irq vector cfunc
	.p2align CONFIG_X86_L1_CACHE_SHIFT
	idtentry \vector asm_\cfunc \cfunc has_error_code=1
.endm

/*
 * System vectors which invoke their handlers directly and are not
 * going through the regular common device interrupt handling code.
 */
.macro idtentry_sysvec vector cfunc
	idtentry \vector asm_\cfunc \cfunc has_error_code=0
.endm

/*
 * MCE and DB exceptions
 */
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)

/**
 * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 *
 * The macro emits code to set up the kernel context for #MC and #DB
 *
 * If the entry comes from user space it uses the normal entry path
 * including the return to user space work and preemption checks on
 * exit.
 *
 * If it hits in kernel mode then it needs to go through the paranoid
 * entry as the exception can hit any random state. No preemption
 * check on exit to keep the paranoid path simple.
 *
 * If the trap is #DB then the interrupt stack entry in the IST is
 * moved to the second stack, so a potential recursion will have a
 * fresh IST.
 */
.macro idtentry_mce_db vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_REGS
	ASM_CLAC

	pushq	$-1			/* ORIG_RAX: no syscall to restart */

	/*
	 * If the entry is from userspace, switch stacks and treat it as
	 * a normal entry.
	 */
	testb	$3, CS-ORIG_RAX(%rsp)
	jnz	.Lfrom_usermode_switch_stack_\@

	/*
	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
	 */
	call	paranoid_entry

	UNWIND_HINT_REGS

	.if \vector == X86_TRAP_DB
		TRACE_IRQS_OFF_DEBUG
	.else
		TRACE_IRQS_OFF
	.endif

	movq	%rsp, %rdi		/* pt_regs pointer */

	.if \vector == X86_TRAP_DB
		subq	$DB_STACK_OFFSET, CPU_TSS_IST(IST_INDEX_DB)
	.endif

	call	\cfunc

	.if \vector == X86_TRAP_DB
		addq	$DB_STACK_OFFSET, CPU_TSS_IST(IST_INDEX_DB)
	.endif

	jmp	paranoid_exit

	/* Switch to the regular task stack and use the noist entry point */
.Lfrom_usermode_switch_stack_\@:
	idtentry_body noist_\cfunc, has_error_code=0

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

/*
 * Double fault entry. Straight paranoid. No checks from which context
 * this comes because for the espfix induced #DF this would do the wrong
 * thing.
 */
.macro idtentry_df vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_REGS offset=8
	ASM_CLAC

	/*
	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
	 */
	call	paranoid_entry
	UNWIND_HINT_REGS

	movq	%rsp, %rdi		/* pt_regs pointer into first argument */
	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
	call	\cfunc

	jmp	paranoid_exit

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

/*
 * Include the defines which emit the idt entries which are shared
 * between 32 and 64 bit.
 */
#include <asm/idtentry.h>

SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
	/* Assert that pt_regs indicates user mode. */
	testb	$3, CS(%rsp)
	jnz	1f
	ud2
1:
#endif
	POP_REGS pop_rdi=0

	/*
	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_EMPTY

	/* Copy the IRET frame to the trampoline stack. */
	pushq	6*8(%rdi)	/* SS */
	pushq	5*8(%rdi)	/* RSP */
	pushq	4*8(%rdi)	/* EFLAGS */
	pushq	3*8(%rdi)	/* CS */
	pushq	2*8(%rdi)	/* RIP */

	/* Push user RDI on the trampoline stack. */
	pushq	(%rdi)

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	/* Restore RDI. */
	popq	%rdi
	SWAPGS
	INTERRUPT_RETURN


SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
	/* Assert that pt_regs indicates kernel mode. */
	testb	$3, CS(%rsp)
	jz	1f
	ud2
1:
#endif
	POP_REGS
	addq	$8, %rsp	/* skip regs->orig_ax */
	/*
	 * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
	 * when returning from IPI handler.
	 */
	INTERRUPT_RETURN

SYM_INNER_LABEL_ALIGN(native_iret, SYM_L_GLOBAL)
	UNWIND_HINT_IRET_REGS
	/*
	 * Are we returning to a stack segment from the LDT?  Note: in
	 * 64-bit mode SS:RSP on the exception stack is always valid.
	 */
#ifdef CONFIG_X86_ESPFIX64
	testb	$4, (SS-RIP)(%rsp)
	jnz	native_irq_return_ldt
#endif

SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
	/*
	 * This may fault.  Non-paranoid faults on return to userspace are
	 * handled by fixup_bad_iret.  These include #SS, #GP, and #NP.
	 * Double-faults due to espfix64 are handled in exc_double_fault.
	 * Other faults here are fatal.
	 */
	iretq

#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
	/*
	 * We are running with user GSBASE.  All GPRs contain their user
	 * values.  We have a percpu ESPFIX stack that is eight slots
	 * long (see ESPFIX_STACK_SIZE).  espfix_waddr points to the bottom
	 * of the ESPFIX stack.
	 *
	 * We clobber RAX and RDI in this code.  We stash RDI on the
	 * normal stack and RAX on the ESPFIX stack.
	 *
	 * The ESPFIX stack layout we set up looks like this:
	 *
	 * --- top of ESPFIX stack ---
	 * SS
	 * RSP
	 * RFLAGS
	 * CS
	 * RIP  <-- RSP points here when we're done
	 * RAX  <-- espfix_waddr points here
	 * --- bottom of ESPFIX stack ---
	 */

	pushq	%rdi				/* Stash user RDI */
	SWAPGS					/* to kernel GS */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */

	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* user RAX */
	movq	(1*8)(%rsp), %rax		/* user RIP */
	movq	%rax, (1*8)(%rdi)
	movq	(2*8)(%rsp), %rax		/* user CS */
	movq	%rax, (2*8)(%rdi)
	movq	(3*8)(%rsp), %rax		/* user RFLAGS */
	movq	%rax, (3*8)(%rdi)
	movq	(5*8)(%rsp), %rax		/* user SS */
	movq	%rax, (5*8)(%rdi)
	movq	(4*8)(%rsp), %rax		/* user RSP */
	movq	%rax, (4*8)(%rdi)
	/* Now RAX == RSP. */

	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */

	/*
	 * espfix_stack[31:16] == 0.  The page tables are set up such that
	 * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
	 * espfix_waddr for any X.  That is, there are 65536 RO aliases of
	 * the same page.  Set up RSP so that RSP[31:16] contains the
	 * respective 16 bits of the /userspace/ RSP and RSP nonetheless
	 * still points to an RO alias of the ESPFIX stack.
	 */
	orq	PER_CPU_VAR(espfix_stack), %rax

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
	SWAPGS					/* to user GS */
	popq	%rdi				/* Restore user RDI */

	movq	%rax, %rsp
	UNWIND_HINT_IRET_REGS offset=8

	/*
	 * At this point, we cannot write to the stack any more, but we can
	 * still read.
	 */
	popq	%rax				/* Restore user RAX */

	/*
	 * RSP now points to an ordinary IRET frame, except that the page
	 * is read-only and RSP[31:16] are preloaded with the userspace
	 * values.  We can now IRET back to userspace.
	 */
	jmp	native_irq_return_iret
#endif
SYM_CODE_END(common_interrupt_return)
_ASM_NOKPROBE(common_interrupt_return)

/*
 * Reload gs selector with exception handling
 * edi:  new selector
 *
 * Is in entry.text as it shouldn't be instrumented.
 */
SYM_FUNC_START(asm_load_gs_index)
	FRAME_BEGIN
	swapgs
.Lgs_change:
	movl	%edi, %gs
2:	ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
	swapgs
	FRAME_END
	ret
SYM_FUNC_END(asm_load_gs_index)
EXPORT_SYMBOL(asm_load_gs_index)

	_ASM_EXTABLE(.Lgs_change, .Lbad_gs)
	.section .fixup, "ax"
	/* running with kernelgs */
SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
	swapgs					/* switch back to user gs */
.macro ZAP_GS
	/* This can't be a string because the preprocessor needs to see it. */
	movl $__USER_DS, %eax
	movl %eax, %gs
.endm
	ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
	xorl	%eax, %eax
	movl	%eax, %gs
	jmp	2b
SYM_CODE_END(.Lbad_gs)
	.previous

/*
 * rdi: New stack pointer points to the top word of the stack
 * rsi: Function pointer
 * rdx: Function argument (can be NULL if none)
 */
SYM_FUNC_START(asm_call_on_stack)
	/*
	 * Save the frame pointer unconditionally. This allows the ORC
	 * unwinder to handle the stack switch.
	 */
	pushq		%rbp
	mov		%rsp, %rbp

	/*
	 * The unwinder relies on the word at the top of the new stack
	 * page linking back to the previous RSP.
	 */
	mov		%rsp, (%rdi)
	mov		%rdi, %rsp
	/* Move the argument to the right place */
	mov		%rdx, %rdi

1:
	.pushsection .discard.instr_begin
	.long 1b - .
	.popsection

	CALL_NOSPEC	rsi

2:
	.pushsection .discard.instr_end
	.long 2b - .
	.popsection

	/* Restore the previous stack pointer from RBP. */
	leaveq
	ret
SYM_FUNC_END(asm_call_on_stack)

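/*
 * Sketch of the assumed C-side view (for reference only, not defined here):
 *
 *	void asm_call_on_stack(void *stack_top, void (*func)(void), void *arg);
 *
 * %rdi carries the new stack top, %rsi the function and %rdx its argument,
 * matching the register usage documented above.
 */
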
#ifdef CONFIG_XEN_PV
/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 *
 * C calling convention: exc_xen_hypervisor_callback(struct *pt_regs)
 */
SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)

/*
 * Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will
 * see the correct pointer to the pt_regs
 */
	UNWIND_HINT_FUNC
	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
	UNWIND_HINT_REGS

	call	xen_pv_evtchn_do_upcall

	jmp	error_return
SYM_CODE_END(exc_xen_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
SYM_CODE_START(xen_failsafe_callback)
	UNWIND_HINT_EMPTY
	movl	%ds, %ecx
	cmpw	%cx, 0x10(%rsp)
	jne	1f
	movl	%es, %ecx
	cmpw	%cx, 0x18(%rsp)
	jne	1f
	movl	%fs, %ecx
	cmpw	%cx, 0x20(%rsp)
	jne	1f
	movl	%gs, %ecx
	cmpw	%cx, 0x28(%rsp)
	jne	1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$0				/* RIP */
	UNWIND_HINT_IRET_REGS offset=8
	jmp	asm_exc_general_protection
1:	/* Segment mismatch => Category 1 (Bad segment).  Retry the IRET. */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	UNWIND_HINT_IRET_REGS
	pushq	$-1 /* orig_ax = -1 => not a system call */
	PUSH_AND_CLEAR_REGS
	ENCODE_FRAME_POINTER
	jmp	error_return
SYM_CODE_END(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Use slow, but surefire "are we in kernel?" check.
 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
 */
SYM_CODE_START_LOCAL(paranoid_entry)
	UNWIND_HINT_FUNC
	cld
	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8
	movl	$1, %ebx
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
	js	1f				/* negative -> in kernel */
	SWAPGS
	xorl	%ebx, %ebx

1:
	/*
	 * Always stash CR3 in %r14.  This value will be restored,
	 * verbatim, at exit.  Needed if paranoid_entry interrupted
	 * another entry that already switched to the user CR3 value
	 * but has not yet returned to userspace.
	 *
	 * This is also why CS (stashed in the "iret frame" by the
	 * hardware at entry) can not be used: this may be a return
	 * to kernel code, but with a user CR3 value.
	 */
	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14

	/*
	 * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
	 * unconditional CR3 write, even in the PTI case.  So do an lfence
	 * to prevent GS speculation, regardless of whether PTI is enabled.
	 */
	FENCE_SWAPGS_KERNEL_ENTRY

	ret
SYM_CODE_END(paranoid_entry)

/*
 * "Paranoid" exit path from exception stack.  This is invoked
 * only on return from non-NMI IST interrupts that came
 * from kernel space.
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
 * be complicated.  Fortunately, there's no good reason to try
 * to handle preemption here.
 *
 * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
 */
SYM_CODE_START_LOCAL(paranoid_exit)
	UNWIND_HINT_REGS
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_OFF_DEBUG
	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	.Lparanoid_exit_no_swapgs
	TRACE_IRQS_IRETQ
	/* Always restore stashed CR3 value (see paranoid_entry) */
	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
	SWAPGS_UNSAFE_STACK
	jmp	restore_regs_and_return_to_kernel
.Lparanoid_exit_no_swapgs:
	TRACE_IRQS_IRETQ_DEBUG
	/* Always restore stashed CR3 value (see paranoid_entry) */
	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
	jmp	restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)

/*
 * Save all registers in pt_regs, and switch GS if needed.
 */
SYM_CODE_START_LOCAL(error_entry)
	UNWIND_HINT_FUNC
	cld
	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8
	testb	$3, CS+8(%rsp)
	jz	.Lerror_kernelspace

	/*
	 * We entered from user mode or we're pretending to have entered
	 * from user mode due to an IRET fault.
	 */
	SWAPGS
	FENCE_SWAPGS_USER_ENTRY
	/* We have user CR3.  Change to kernel CR3. */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

.Lerror_entry_from_usermode_after_swapgs:
	/* Put us onto the real thread stack. */
	popq	%r12				/* save return addr in %r12 */
	movq	%rsp, %rdi			/* arg0 = pt_regs pointer */
	call	sync_regs
	movq	%rax, %rsp			/* switch stack */
	ENCODE_FRAME_POINTER
	pushq	%r12
	ret

.Lerror_entry_done_lfence:
	FENCE_SWAPGS_KERNEL_ENTRY
.Lerror_entry_done:
	ret

	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here.  B stepping K8s sometimes report a
	 * truncated RIP for IRET exceptions returning to compat mode. Check
	 * for these here too.
	 */
.Lerror_kernelspace:
	leaq	native_irq_return_iret(%rip), %rcx
	cmpq	%rcx, RIP+8(%rsp)
	je	.Lerror_bad_iret
	movl	%ecx, %eax			/* zero extend */
	cmpq	%rax, RIP+8(%rsp)
	je	.Lbstep_iret
	cmpq	$.Lgs_change, RIP+8(%rsp)
	jne	.Lerror_entry_done_lfence

	/*
	 * hack: .Lgs_change can fail with user gsbase.  If this happens, fix up
	 * gsbase and proceed.  We'll fix up the exception and land in
	 * .Lgs_change's error handler with kernel gsbase.
	 */
	SWAPGS
	FENCE_SWAPGS_USER_ENTRY
	jmp .Lerror_entry_done

.Lbstep_iret:
	/* Fix truncated RIP */
	movq	%rcx, RIP+8(%rsp)
	/* fall through */

.Lerror_bad_iret:
	/*
	 * We came from an IRET to user mode, so we have user
	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
	 */
	SWAPGS
	FENCE_SWAPGS_USER_ENTRY
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
	 * as if we faulted immediately after IRET.
	 */
	mov	%rsp, %rdi
	call	fixup_bad_iret
	mov	%rax, %rsp
	jmp	.Lerror_entry_from_usermode_after_swapgs
SYM_CODE_END(error_entry)

SYM_CODE_START_LOCAL(error_return)
	UNWIND_HINT_REGS
	DEBUG_ENTRY_ASSERT_IRQS_OFF
	testb	$3, CS(%rsp)
	jz	restore_regs_and_return_to_kernel
	jmp	swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(error_return)

/*
 * Runs on exception stack.  Xen PV does not go through this path at all,
 * so we can use real assembly here.
 *
 * Registers:
 *	%r14: Used to save/restore the CR3 of the interrupted context
 *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
 */
SYM_CODE_START(asm_exc_nmi)
	UNWIND_HINT_IRET_REGS

	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *  Check a special location on the stack that contains
	 *  a variable that is set when NMIs are executing.
	 *  The interrupted task's stack is also checked to see if it
	 *  is an NMI stack.
	 *  If the variable is not set and the stack is not the NMI
	 *  stack then:
	 *    o Set the special variable on the stack
	 *    o Copy the interrupt frame into an "outermost" location on the
	 *      stack
	 *    o Copy the interrupt frame into an "iret" location on the stack
	 *    o Continue processing the NMI
	 *  If the variable is set or the previous stack is the NMI stack:
	 *    o Modify the "iret" location to jump to the repeat_nmi
	 *    o return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 *
	 * However, espfix prevents us from directly returning to userspace
	 * with a single IRET instruction.  Similarly, IRET to user mode
	 * can fault.  We therefore handle NMIs from user space like
	 * other IST entries.
	 */

	ASM_CLAC

	/* Use %rdx as our temp variable throughout */
	pushq	%rdx

	testb	$3, CS-RIP+8(%rsp)
	jz	.Lnmi_from_kernel

	/*
	 * NMI from user mode.  We need to run on the thread stack, but we
	 * can't go through the normal entry paths: NMIs are masked, and
	 * we don't want to enable interrupts, because then we'll end
	 * up in an awkward situation in which IRQs are on but NMIs
	 * are off.
	 *
	 * We also must not push anything to the stack before switching
	 * stacks lest we corrupt the "NMI executing" variable.
	 */

	swapgs
	cld
	FENCE_SWAPGS_USER_ENTRY
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	UNWIND_HINT_IRET_REGS base=%rdx offset=8
	pushq	5*8(%rdx)	/* pt_regs->ss */
	pushq	4*8(%rdx)	/* pt_regs->rsp */
	pushq	3*8(%rdx)	/* pt_regs->flags */
	pushq	2*8(%rdx)	/* pt_regs->cs */
	pushq	1*8(%rdx)	/* pt_regs->rip */
	UNWIND_HINT_IRET_REGS
	pushq	$-1		/* pt_regs->orig_ax */
	PUSH_AND_CLEAR_REGS rdx=(%rdx)
	ENCODE_FRAME_POINTER

	/*
	 * At this point we no longer need to worry about stack damage
	 * due to nesting -- we're on the normal thread stack and we're
	 * done with the NMI stack.
	 */

	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	exc_nmi

	/*
	 * Return back to user mode.  We must *not* do the normal exit
	 * work, because we don't want to enable interrupts.
	 */
	jmp	swapgs_restore_regs_and_return_to_usermode

.Lnmi_from_kernel:
	/*
	 * Here's what our stack frame will look like:
	 * +---------------------------------------------------------+
	 * | original SS                                              |
	 * | original Return RSP                                      |
	 * | original RFLAGS                                          |
	 * | original CS                                              |
	 * | original RIP                                             |
	 * +---------------------------------------------------------+
	 * | temp storage for rdx                                     |
	 * +---------------------------------------------------------+
	 * | "NMI executing" variable                                 |
	 * +---------------------------------------------------------+
	 * | iret SS          } Copied from "outermost" frame         |
	 * | iret Return RSP  } on each loop iteration; overwritten   |
	 * | iret RFLAGS      } by a nested NMI to force another      |
	 * | iret CS          } iteration if needed.                  |
	 * | iret RIP         }                                       |
	 * +---------------------------------------------------------+
	 * | outermost SS          } initialized in first_nmi;        |
	 * | outermost Return RSP  } will not be changed before       |
	 * | outermost RFLAGS      } NMI processing is done.          |
	 * | outermost CS          } Copied to "iret" frame on each   |
	 * | outermost RIP         } iteration.                       |
	 * +---------------------------------------------------------+
	 * | pt_regs                                                  |
	 * +---------------------------------------------------------+
	 *
	 * The "original" frame is used by hardware.  Before re-enabling
	 * NMIs, we need to be done with it, and we need to leave enough
	 * space for the asm code here.
	 *
	 * We return by executing IRET while RSP points to the "iret" frame.
	 * That will either return for real or it will loop back into NMI
	 * processing.
	 *
	 * The "outermost" frame is copied to the "iret" frame on each
	 * iteration of the loop, so each iteration starts with the "iret"
	 * frame pointing to the final return target.
	 */

	/*
	 * Determine whether we're a nested NMI.
	 *
	 * If we interrupted kernel code between repeat_nmi and
	 * end_repeat_nmi, then we are a nested NMI.  We must not
	 * modify the "iret" frame because it's being written by
	 * the outer NMI.  That's okay; the outer NMI handler is
	 * about to call exc_nmi() anyway, so we can just
	 * resume the outer NMI.
	 */

	movq	$repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	1f
	movq	$end_repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	nested_nmi_out
1:

	/*
	 * Now check "NMI executing".  If it's set, then we're nested.
	 * This will not detect if we interrupted an outer NMI just
	 * before IRET.
	 */
	cmpl	$1, -8(%rsp)
	je	nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack.  This covers
	 * the case where we interrupt an outer NMI after it clears
	 * "NMI executing" but before IRET.  We need to be careful, though:
	 * there is one case in which RSP could point to the NMI stack
	 * despite there being no NMI active: naughty userspace controls
	 * RSP at the very beginning of the SYSCALL targets.  We can
	 * pull a fast one on naughty userspace, though: we program
	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
	 * if it controls the kernel's RSP.  We set DF before we clear
	 * "NMI executing".
	 */
	lea	6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
	cmpq	%rdx, 4*8(%rsp)
	/* If the stack pointer is above the NMI stack, this is a normal NMI */
	ja	first_nmi

	subq	$EXCEPTION_STKSZ, %rdx
	cmpq	%rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb	first_nmi

	/* Ah, it is within the NMI stack. */

	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
	jz	first_nmi	/* RSP was user controlled. */

	/* This is a nested NMI. */

nested_nmi:
	/*
	 * Modify the "iret" frame to point to repeat_nmi, forcing another
	 * iteration of NMI handling.
	 */
	subq	$8, %rsp
	leaq	-10*8(%rsp), %rdx
	pushq	$__KERNEL_DS
	pushq	%rdx
	pushfq
	pushq	$__KERNEL_CS
	pushq	$repeat_nmi

	/* Put stack back */
	addq	$(6*8), %rsp

nested_nmi_out:
	popq	%rdx

	/* We are returning to kernel mode, so this cannot result in a fault. */
	iretq

first_nmi:
	/* Restore rdx. */
	movq	(%rsp), %rdx

	/* Make room for "NMI executing". */
	pushq	$0

	/* Leave room for the "iret" frame */
	subq	$(5*8), %rsp

	/* Copy the "original" frame to the "outermost" frame */
	.rept 5
	pushq	11*8(%rsp)
	.endr
	UNWIND_HINT_IRET_REGS

	/* Everything up to here is safe from nested NMIs */

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * For ease of testing, unmask NMIs right away.  Disabled by
	 * default because IRET is very expensive.
	 */
	pushq	$0		/* SS */
	pushq	%rsp		/* RSP (minus 8 because of the previous push) */
	addq	$8, (%rsp)	/* Fix up RSP */
	pushfq			/* RFLAGS */
	pushq	$__KERNEL_CS	/* CS */
	pushq	$1f		/* RIP */
	iretq			/* continues at repeat_nmi below */
	UNWIND_HINT_IRET_REGS
1:
#endif

repeat_nmi:
	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
	 * nested NMI. The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
	 *
	 * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
	 * we're repeating an NMI, gsbase has the same value that it had on
	 * the first iteration.  paranoid_entry will load the kernel
	 * gsbase if needed before we call exc_nmi().  "NMI executing"
	 * is zero.
	 */
	movq	$1, 10*8(%rsp)		/* Set "NMI executing". */

	/*
	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
	 * here must not modify the "iret" frame while we're writing to
	 * it or it will end up containing garbage.
	 */
	addq	$(10*8), %rsp
	.rept 5
	pushq	-6*8(%rsp)
	.endr
	subq	$(5*8), %rsp
end_repeat_nmi:

	/*
	 * Everything below this point can be preempted by a nested NMI.
	 * If this happens, then the inner NMI will change the "iret"
	 * frame to point back to repeat_nmi.
	 */
	pushq	$-1				/* ORIG_RAX: no syscall to restart */

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled. An NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 */
	call	paranoid_entry
	UNWIND_HINT_REGS

	/* paranoidentry exc_nmi(), 0; without TRACE_IRQS_OFF */
	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	exc_nmi

	/* Always restore stashed CR3 value (see paranoid_entry) */
	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14

	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	nmi_restore
nmi_swapgs:
	SWAPGS_UNSAFE_STACK
nmi_restore:
	POP_REGS

	/*
	 * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
	 * frame.
	 */
	addq	$6*8, %rsp

	/*
	 * Clear "NMI executing".  Set DF first so that we can easily
	 * distinguish the remaining code between here and IRET from
	 * the SYSCALL entry and exit paths.
	 *
	 * We arguably should just inspect RIP instead, but I (Andy) wrote
	 * this code when I had the misapprehension that Xen PV supported
	 * NMIs, and Xen PV would break that approach.
	 */
	std
	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */

	/*
	 * iretq reads the "iret" frame and exits the NMI stack in a
	 * single instruction.  We are returning to kernel mode, so this
	 * cannot result in a fault.  Similarly, we don't need to worry
	 * about espfix64 on the way back to kernel mode.
	 */
	iretq
SYM_CODE_END(asm_exc_nmi)

#ifndef CONFIG_IA32_EMULATION
/*
 * This handles SYSCALL from 32-bit code.  There is no way to program
 * MSRs to fully disable 32-bit SYSCALL.
 */
SYM_CODE_START(ignore_sysret)
	UNWIND_HINT_EMPTY
	mov	$-ENOSYS, %eax
	sysretl
SYM_CODE_END(ignore_sysret)
#endif

.pushsection .text, "ax"
SYM_CODE_START(rewind_stack_do_exit)
	UNWIND_HINT_FUNC
	/* Prevent any naive code from trying to unwind to our caller. */
	xorl	%ebp, %ebp

	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rax
	leaq	-PTREGS_SIZE(%rax), %rsp
	UNWIND_HINT_REGS

	call	do_exit
SYM_CODE_END(rewind_stack_do_exit)
.popsection