/* Commit | Line | Data — git-blame column legend for the listing below */
1da177e4 | 1 | /* |
54ad726c IM |
2 | * Compatibility mode system call entry point for x86-64. |
3 | * | |
1da177e4 | 4 | * Copyright 2000-2002 Andi Kleen, SuSE Labs. |
54ad726c | 5 | */ |
d36f9479 | 6 | #include "calling.h" |
e2d5df93 | 7 | #include <asm/asm-offsets.h> |
1da177e4 LT |
8 | #include <asm/current.h> |
9 | #include <asm/errno.h> | |
54ad726c IM |
10 | #include <asm/ia32_unistd.h> |
11 | #include <asm/thread_info.h> | |
1da177e4 | 12 | #include <asm/segment.h> |
2601e64d | 13 | #include <asm/irqflags.h> |
1ce6f868 | 14 | #include <asm/asm.h> |
63bcff2a | 15 | #include <asm/smap.h> |
d7eb5f9e | 16 | #include <asm/spec_ctrl.h> |
1da177e4 | 17 | #include <linux/linkage.h> |
d7e7528b | 18 | #include <linux/err.h> |
1da177e4 | 19 | |
ea714547 JO |
20 | .section .entry.text, "ax" |
21 | ||
1da177e4 | 22 | /* |
fda57b22 | 23 | * 32-bit SYSENTER entry. |
1da177e4 | 24 | * |
fda57b22 AL |
25 | * 32-bit system calls through the vDSO's __kernel_vsyscall enter here |
26 | * on 64-bit kernels running on Intel CPUs. | |
27 | * | |
28 | * The SYSENTER instruction, in principle, should *only* occur in the | |
29 | * vDSO. In practice, a small number of Android devices were shipped | |
30 | * with a copy of Bionic that inlined a SYSENTER instruction. This | |
31 | * never happened in any of Google's Bionic versions -- it only happened | |
32 | * in a narrow range of Intel-provided versions. | |
33 | * | |
34 | * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs. | |
35 | * IF and VM in RFLAGS are cleared (IOW: interrupts are off). | |
b87cf63e | 36 | * SYSENTER does not save anything on the stack, |
fda57b22 | 37 | * and does not save old RIP (!!!), RSP, or RFLAGS. |
b87cf63e | 38 | * |
1da177e4 | 39 | * Arguments: |
b87cf63e DV |
40 | * eax system call number |
41 | * ebx arg1 | |
42 | * ecx arg2 | |
43 | * edx arg3 | |
44 | * esi arg4 | |
45 | * edi arg5 | |
46 | * ebp user stack | |
47 | * 0(%ebp) arg6 | |
b87cf63e | 48 | */ |
4c8cd0c5 | 49 | ENTRY(entry_SYSENTER_compat) |
b611acf4 | 50 | /* Interrupts are off on entry. */ |
8e621515 | 51 | SWAPGS |
313dfb59 DH |
52 | |
53 | /* We are about to clobber %rsp anyway, clobbering here is OK */ | |
54 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp | |
55 | ||
3a23208e | 56 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
a232e3d5 | 57 | |
a474e67c AL |
58 | /* |
59 | * User tracing code (ptrace or signal handlers) might assume that | |
60 | * the saved RAX contains a 32-bit number when we're invoking a 32-bit | |
61 | * syscall. Just in case the high bits are nonzero, zero-extend | |
62 | * the syscall number. (This could almost certainly be deleted | |
63 | * with no ill effects.) | |
64 | */ | |
4ee8ec17 DV |
65 | movl %eax, %eax |
66 | ||
4c9c0e91 | 67 | /* Construct struct pt_regs on stack */ |
131484c8 | 68 | pushq $__USER32_DS /* pt_regs->ss */ |
30bfa7b3 | 69 | pushq %rbp /* pt_regs->sp (stashed in bp) */ |
b611acf4 AL |
70 | |
71 | /* | |
72 | * Push flags. This is nasty. First, interrupts are currently | |
73 | * off, but we need pt_regs->flags to have IF set. Second, even | |
74 | * if TF was set when SYSENTER started, it's clear by now. We fix | |
75 | * that later using TIF_SINGLESTEP. | |
76 | */ | |
77 | pushfq /* pt_regs->flags (except IF = 0) */ | |
78 | orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ | |
131484c8 | 79 | pushq $__USER32_CS /* pt_regs->cs */ |
778843f9 | 80 | pushq $0 /* pt_regs->ip = 0 (placeholder) */ |
131484c8 IM |
81 | pushq %rax /* pt_regs->orig_ax */ |
82 | pushq %rdi /* pt_regs->di */ | |
83 | pushq %rsi /* pt_regs->si */ | |
84 | pushq %rdx /* pt_regs->dx */ | |
30bfa7b3 | 85 | pushq %rcx /* pt_regs->cx */ |
131484c8 | 86 | pushq $-ENOSYS /* pt_regs->ax */ |
778843f9 DV |
87 | pushq $0 /* pt_regs->r8 = 0 */ |
88 | pushq $0 /* pt_regs->r9 = 0 */ | |
89 | pushq $0 /* pt_regs->r10 = 0 */ | |
90 | pushq $0 /* pt_regs->r11 = 0 */ | |
a474e67c | 91 | pushq %rbx /* pt_regs->rbx */ |
30bfa7b3 | 92 | pushq %rbp /* pt_regs->rbp (will be overwritten) */ |
778843f9 DV |
93 | pushq $0 /* pt_regs->r12 = 0 */ |
94 | pushq $0 /* pt_regs->r13 = 0 */ | |
95 | pushq $0 /* pt_regs->r14 = 0 */ | |
96 | pushq $0 /* pt_regs->r15 = 0 */ | |
1da177e4 | 97 | cld |
4c9c0e91 | 98 | |
d7eb5f9e | 99 | ENABLE_IBRS |
b82785ac | 100 | STUFF_RSB |
d7eb5f9e | 101 | |
8c7aa698 | 102 | /* |
e7860411 | 103 | * SYSENTER doesn't filter flags, so we need to clear NT and AC |
8c7aa698 | 104 | * ourselves. To save a few cycles, we can check whether |
e7860411 | 105 | * either was set instead of doing an unconditional popfq. |
b611acf4 AL |
106 | * This needs to happen before enabling interrupts so that |
107 | * we don't get preempted with NT set. | |
374a3a39 | 108 | * |
f2b37575 AL |
109 | * If TF is set, we will single-step all the way to here -- do_debug |
110 | * will ignore all the traps. (Yes, this is slow, but so is | |
111 | * single-stepping in general. This allows us to avoid having | |
112 | * a more complicated code to handle the case where a user program | |
113 | * forces us to single-step through the SYSENTER entry code.) | |
114 | * | |
f74acf0e | 115 | * NB.: .Lsysenter_fix_flags is a label with the code under it moved |
374a3a39 BP |
116 | * out-of-line as an optimization: NT is unlikely to be set in the |
117 | * majority of the cases and instead of polluting the I$ unnecessarily, | |
118 | * we're keeping that code behind a branch which will predict as | |
119 | * not-taken and therefore its instructions won't be fetched. | |
8c7aa698 | 120 | */ |
f2b37575 | 121 | testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp) |
f74acf0e BP |
122 | jnz .Lsysenter_fix_flags |
123 | .Lsysenter_flags_fixed: | |
8c7aa698 | 124 | |
a474e67c AL |
125 | /* |
126 | * User mode is traced as though IRQs are on, and SYSENTER | |
127 | * turned them off. | |
128 | */ | |
129 | TRACE_IRQS_OFF | |
e62a254a | 130 | |
a474e67c AL |
131 | movq %rsp, %rdi |
132 | call do_fast_syscall_32 | |
91e2eea9 BO |
133 | /* XEN PV guests always use IRET path */ |
134 | ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ | |
135 | "jmp .Lsyscall_32_done", X86_FEATURE_XENPV | |
7841b408 | 136 | jmp sysret32_from_system_call |
1da177e4 | 137 | |
f74acf0e | 138 | .Lsysenter_fix_flags: |
b611acf4 | 139 | pushq $X86_EFLAGS_FIXED |
131484c8 | 140 | popfq |
f74acf0e | 141 | jmp .Lsysenter_flags_fixed |
f2b37575 | 142 | GLOBAL(__end_entry_SYSENTER_compat) |
4c8cd0c5 | 143 | ENDPROC(entry_SYSENTER_compat) |
1da177e4 LT |
144 | |
145 | /* | |
fda57b22 AL |
146 | * 32-bit SYSCALL entry. |
147 | * | |
148 | * 32-bit system calls through the vDSO's __kernel_vsyscall enter here | |
149 | * on 64-bit kernels running on AMD CPUs. | |
150 | * | |
151 | * The SYSCALL instruction, in principle, should *only* occur in the | |
152 | * vDSO. In practice, it appears that this really is the case. | |
153 | * As evidence: | |
154 | * | |
155 | * - The calling convention for SYSCALL has changed several times without | |
156 | * anyone noticing. | |
157 | * | |
158 | * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything | |
159 | * user task that did SYSCALL without immediately reloading SS | |
160 | * would randomly crash. | |
1da177e4 | 161 | * |
fda57b22 AL |
162 | * - Most programmers do not directly target AMD CPUs, and the 32-bit |
163 | * SYSCALL instruction does not exist on Intel CPUs. Even on AMD | |
164 | * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels | |
165 | * because the SYSCALL instruction in legacy/native 32-bit mode (as | |
166 | * opposed to compat mode) is sufficiently poorly designed as to be | |
167 | * essentially unusable. | |
b87cf63e | 168 | * |
fda57b22 AL |
169 | * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves |
170 | * RFLAGS to R11, then loads new SS, CS, and RIP from previously | |
171 | * programmed MSRs. RFLAGS gets masked by a value from another MSR | |
172 | * (so CLD and CLAC are not needed). SYSCALL does not save anything on | |
173 | * the stack and does not change RSP. | |
174 | * | |
175 | * Note: RFLAGS saving+masking-with-MSR happens only in Long mode | |
54ad726c | 176 | * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). |
fda57b22 | 177 | * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit |
b87cf63e DV |
178 | * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes |
179 | * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). | |
180 | * | |
1da177e4 | 181 | * Arguments: |
b87cf63e DV |
182 | * eax system call number |
183 | * ecx return address | |
184 | * ebx arg1 | |
185 | * ebp arg2 (note: not saved in the stack frame, should not be touched) | |
186 | * edx arg3 | |
187 | * esi arg4 | |
188 | * edi arg5 | |
189 | * esp user stack | |
190 | * 0(%esp) arg6 | |
b87cf63e | 191 | */ |
2cd23553 | 192 | ENTRY(entry_SYSCALL_compat) |
a474e67c | 193 | /* Interrupts are off on entry. */ |
b8cec41e | 194 | swapgs |
e62a254a | 195 | |
2f45cd7a | 196 | /* Stash user ESP.*/ |
54ad726c | 197 | movl %esp, %r8d |
2f45cd7a TG |
198 | |
199 | /* Use %rsp as scratch reg. User ESP is stashed in r8 */ | |
200 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp | |
d7eb5f9e | 201 | ENABLE_IBRS |
2f45cd7a TG |
202 | |
203 | /* Switch to the kernel stack */ | |
54ad726c | 204 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
a232e3d5 | 205 | |
4c9c0e91 | 206 | /* Construct struct pt_regs on stack */ |
131484c8 IM |
207 | pushq $__USER32_DS /* pt_regs->ss */ |
208 | pushq %r8 /* pt_regs->sp */ | |
209 | pushq %r11 /* pt_regs->flags */ | |
210 | pushq $__USER32_CS /* pt_regs->cs */ | |
211 | pushq %rcx /* pt_regs->ip */ | |
b8cec41e AL |
212 | GLOBAL(entry_SYSCALL_compat_after_hwframe) |
213 | movl %eax, %eax /* discard orig_ax high bits */ | |
131484c8 IM |
214 | pushq %rax /* pt_regs->orig_ax */ |
215 | pushq %rdi /* pt_regs->di */ | |
216 | pushq %rsi /* pt_regs->si */ | |
217 | pushq %rdx /* pt_regs->dx */ | |
30bfa7b3 | 218 | pushq %rbp /* pt_regs->cx (stashed in bp) */ |
131484c8 | 219 | pushq $-ENOSYS /* pt_regs->ax */ |
778843f9 DV |
220 | pushq $0 /* pt_regs->r8 = 0 */ |
221 | pushq $0 /* pt_regs->r9 = 0 */ | |
222 | pushq $0 /* pt_regs->r10 = 0 */ | |
223 | pushq $0 /* pt_regs->r11 = 0 */ | |
a474e67c | 224 | pushq %rbx /* pt_regs->rbx */ |
30bfa7b3 | 225 | pushq %rbp /* pt_regs->rbp (will be overwritten) */ |
e7571a03 AW |
226 | pushq $0 /* pt_regs->r12 = 0 */ |
227 | pushq $0 /* pt_regs->r13 = 0 */ | |
228 | pushq $0 /* pt_regs->r14 = 0 */ | |
229 | pushq $0 /* pt_regs->r15 = 0 */ | |
4c9c0e91 | 230 | |
b82785ac TC |
231 | STUFF_RSB |
232 | ||
a474e67c AL |
233 | /* |
234 | * User mode is traced as though IRQs are on, and SYSENTER | |
235 | * turned them off. | |
236 | */ | |
237 | TRACE_IRQS_OFF | |
238 | ||
239 | movq %rsp, %rdi | |
240 | call do_fast_syscall_32 | |
91e2eea9 BO |
241 | /* XEN PV guests always use IRET path */ |
242 | ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ | |
243 | "jmp .Lsyscall_32_done", X86_FEATURE_XENPV | |
7841b408 AL |
244 | |
245 | /* Opportunistic SYSRET */ | |
246 | sysret32_from_system_call: | |
247 | TRACE_IRQS_ON /* User mode traces as IRQs on. */ | |
248 | movq RBX(%rsp), %rbx /* pt_regs->rbx */ | |
249 | movq RBP(%rsp), %rbp /* pt_regs->rbp */ | |
250 | movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ | |
251 | movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */ | |
252 | addq $RAX, %rsp /* Skip r8-r15 */ | |
253 | popq %rax /* pt_regs->rax */ | |
254 | popq %rdx /* Skip pt_regs->cx */ | |
255 | popq %rdx /* pt_regs->dx */ | |
256 | popq %rsi /* pt_regs->si */ | |
257 | popq %rdi /* pt_regs->di */ | |
258 | ||
d7eb5f9e | 259 | DISABLE_IBRS |
7841b408 AL |
260 | /* |
261 | * USERGS_SYSRET32 does: | |
262 | * GSBASE = user's GS base | |
263 | * EIP = ECX | |
264 | * RFLAGS = R11 | |
265 | * CS = __USER32_CS | |
266 | * SS = __USER_DS | |
267 | * | |
268 | * ECX will not match pt_regs->cx, but we're returning to a vDSO | |
269 | * trampoline that will fix up RCX, so this is okay. | |
270 | * | |
271 | * R12-R15 are callee-saved, so they contain whatever was in them | |
272 | * when the system call started, which is already known to user | |
273 | * code. We zero R8-R10 to avoid info leaks. | |
274 | */ | |
313dfb59 DH |
275 | movq RSP-ORIG_RAX(%rsp), %rsp |
276 | ||
277 | /* | |
278 | * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored | |
279 | * on the process stack which is not mapped to userspace and | |
280 | * not readable after we SWITCH_TO_USER_CR3. Delay the CR3 | |
281 | * switch until after after the last reference to the process | |
282 | * stack. | |
283 | * | |
ac747136 | 284 | * %r8/%r9 are zeroed before the sysret, thus safe to clobber. |
313dfb59 | 285 | */ |
ac747136 | 286 | SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 |
313dfb59 | 287 | |
7841b408 AL |
288 | xorq %r8, %r8 |
289 | xorq %r9, %r9 | |
290 | xorq %r10, %r10 | |
75ef8219 BO |
291 | swapgs |
292 | sysretl | |
2cd23553 | 293 | END(entry_SYSCALL_compat) |
54ad726c | 294 | |
b87cf63e | 295 | /* |
fda57b22 AL |
296 | * 32-bit legacy system call entry. |
297 | * | |
298 | * 32-bit x86 Linux system calls traditionally used the INT $0x80 | |
299 | * instruction. INT $0x80 lands here. | |
300 | * | |
301 | * This entry point can be used by 32-bit and 64-bit programs to perform | |
302 | * 32-bit system calls. Instances of INT $0x80 can be found inline in | |
303 | * various programs and libraries. It is also used by the vDSO's | |
304 | * __kernel_vsyscall fallback for hardware that doesn't support a faster | |
305 | * entry method. Restarted 32-bit system calls also fall back to INT | |
306 | * $0x80 regardless of what instruction was originally used to do the | |
307 | * system call. | |
308 | * | |
309 | * This is considered a slow path. It is not used by most libc | |
310 | * implementations on modern hardware except during process startup. | |
1da177e4 | 311 | * |
b87cf63e DV |
312 | * Arguments: |
313 | * eax system call number | |
314 | * ebx arg1 | |
315 | * ecx arg2 | |
316 | * edx arg3 | |
317 | * esi arg4 | |
318 | * edi arg5 | |
fda57b22 | 319 | * ebp arg6 |
b87cf63e | 320 | */ |
2cd23553 | 321 | ENTRY(entry_INT80_compat) |
2601e64d | 322 | /* |
a232e3d5 | 323 | * Interrupts are off on entry. |
2601e64d | 324 | */ |
3d44d51b | 325 | ASM_CLAC /* Do this early to minimize exposure */ |
a232e3d5 | 326 | SWAPGS |
a232e3d5 | 327 | |
ee08c6bd AL |
328 | /* |
329 | * User tracing code (ptrace or signal handlers) might assume that | |
330 | * the saved RAX contains a 32-bit number when we're invoking a 32-bit | |
331 | * syscall. Just in case the high bits are nonzero, zero-extend | |
332 | * the syscall number. (This could almost certainly be deleted | |
333 | * with no ill effects.) | |
334 | */ | |
54ad726c | 335 | movl %eax, %eax |
4ee8ec17 | 336 | |
131484c8 | 337 | pushq %rax /* pt_regs->orig_ax */ |
bfb2d0ed AL |
338 | |
339 | /* switch to thread stack expects orig_ax to be pushed */ | |
340 | call switch_to_thread_stack | |
341 | ||
131484c8 IM |
342 | pushq %rdi /* pt_regs->di */ |
343 | pushq %rsi /* pt_regs->si */ | |
344 | pushq %rdx /* pt_regs->dx */ | |
345 | pushq %rcx /* pt_regs->cx */ | |
346 | pushq $-ENOSYS /* pt_regs->ax */ | |
778843f9 DV |
347 | pushq $0 /* pt_regs->r8 = 0 */ |
348 | pushq $0 /* pt_regs->r9 = 0 */ | |
349 | pushq $0 /* pt_regs->r10 = 0 */ | |
350 | pushq $0 /* pt_regs->r11 = 0 */ | |
8169aff6 AL |
351 | pushq %rbx /* pt_regs->rbx */ |
352 | pushq %rbp /* pt_regs->rbp */ | |
353 | pushq %r12 /* pt_regs->r12 */ | |
354 | pushq %r13 /* pt_regs->r13 */ | |
355 | pushq %r14 /* pt_regs->r14 */ | |
356 | pushq %r15 /* pt_regs->r15 */ | |
1da177e4 | 357 | cld |
54ad726c | 358 | |
d7eb5f9e | 359 | ENABLE_IBRS |
b82785ac | 360 | STUFF_RSB |
d7eb5f9e | 361 | |
73cbf687 | 362 | /* |
ee08c6bd AL |
363 | * User mode is traced as though IRQs are on, and the interrupt |
364 | * gate turned them off. | |
73cbf687 | 365 | */ |
ee08c6bd AL |
366 | TRACE_IRQS_OFF |
367 | ||
368 | movq %rsp, %rdi | |
a798f091 | 369 | call do_int80_syscall_32 |
a474e67c | 370 | .Lsyscall_32_done: |
ee08c6bd AL |
371 | |
372 | /* Go back to user mode. */ | |
373 | TRACE_IRQS_ON | |
62a85594 | 374 | jmp swapgs_restore_regs_and_return_to_usermode |
2cd23553 | 375 | END(entry_INT80_compat) |
1da177e4 | 376 | |
1d4b4b29 AV |
377 | ALIGN |
378 | GLOBAL(stub32_clone) | |
5cdc683b | 379 | /* |
7a5a9824 DV |
380 | * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). |
381 | * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). | |
382 | * | |
383 | * The native 64-bit kernel's sys_clone() implements the latter, | |
384 | * so we need to swap arguments here before calling it: | |
5cdc683b | 385 | */ |
7a5a9824 | 386 | xchg %r8, %rcx |
8169aff6 | 387 | jmp sys_clone |