]>
Commit | Line | Data |
---|---|---|
1da177e4 | 1 | /* |
54ad726c IM |
2 | * Compatibility mode system call entry point for x86-64. |
3 | * | |
1da177e4 | 4 | * Copyright 2000-2002 Andi Kleen, SuSE Labs. |
54ad726c | 5 | */ |
d36f9479 | 6 | #include "calling.h" |
e2d5df93 | 7 | #include <asm/asm-offsets.h> |
1da177e4 LT |
8 | #include <asm/current.h> |
9 | #include <asm/errno.h> | |
54ad726c IM |
10 | #include <asm/ia32_unistd.h> |
11 | #include <asm/thread_info.h> | |
1da177e4 | 12 | #include <asm/segment.h> |
2601e64d | 13 | #include <asm/irqflags.h> |
1ce6f868 | 14 | #include <asm/asm.h> |
63bcff2a | 15 | #include <asm/smap.h> |
1da177e4 | 16 | #include <linux/linkage.h> |
d7e7528b | 17 | #include <linux/err.h> |
1da177e4 | 18 | |
ea714547 JO |
19 | .section .entry.text, "ax" |
20 | ||
1da177e4 | 21 | /* |
54ad726c | 22 | * 32-bit SYSENTER instruction entry. |
1da177e4 | 23 | * |
b87cf63e DV |
24 | * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. |
25 | * IF and VM in rflags are cleared (IOW: interrupts are off). | |
26 | * SYSENTER does not save anything on the stack, | |
27 | * and does not save old rip (!!!) and rflags. | |
28 | * | |
1da177e4 | 29 | * Arguments: |
b87cf63e DV |
30 | * eax system call number |
31 | * ebx arg1 | |
32 | * ecx arg2 | |
33 | * edx arg3 | |
34 | * esi arg4 | |
35 | * edi arg5 | |
36 | * ebp user stack | |
37 | * 0(%ebp) arg6 | |
38 | * | |
1da177e4 | 39 | * This is purely a fast path. For anything complicated we use the int 0x80 |
b87cf63e | 40 | * path below. We set up a complete hardware stack frame to share code |
1da177e4 | 41 | * with the int 0x80 path. |
b87cf63e | 42 | */ |
4c8cd0c5 | 43 | ENTRY(entry_SYSENTER_compat) |
b611acf4 | 44 | /* Interrupts are off on entry. */ |
a232e3d5 | 45 | SWAPGS_UNSAFE_STACK |
3a23208e | 46 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* rsp = this CPU's current top of stack */ |
a232e3d5 | 47 | |
a474e67c AL |
48 | /* |
49 | * User tracing code (ptrace or signal handlers) might assume that | |
50 | * the saved RAX contains a 32-bit number when we're invoking a 32-bit | |
51 | * syscall. Just in case the high bits are nonzero, zero-extend | |
52 | * the syscall number. (This could almost certainly be deleted | |
53 | * with no ill effects.) | |
54 | */ | |
4ee8ec17 DV |
55 | movl %eax, %eax |
56 | ||
4c9c0e91 | 57 | /* Construct struct pt_regs on stack */ |
131484c8 | 58 | pushq $__USER32_DS /* pt_regs->ss */ |
30bfa7b3 | 59 | pushq %rbp /* pt_regs->sp (stashed in bp) */ |
b611acf4 AL |
60 | |
61 | /* | |
62 | * Push flags. This is nasty. First, interrupts are currently | |
63 | * off, but we need pt_regs->flags to have IF set. Second, even | |
64 | * if TF was set when SYSENTER started, it's clear by now. We fix | |
65 | * that later using TIF_SINGLESTEP. | |
66 | */ | |
67 | pushfq /* pt_regs->flags (except IF = 0) */ | |
68 | orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ | |
a474e67c | 69 | ASM_CLAC /* Clear AC after saving FLAGS */ |
b611acf4 | 70 | |
131484c8 | 71 | pushq $__USER32_CS /* pt_regs->cs */ |
a474e67c AL |
72 | xorq %r8,%r8 |
73 | pushq %r8 /* pt_regs->ip = 0 (placeholder) */ | |
131484c8 IM |
74 | pushq %rax /* pt_regs->orig_ax */ |
75 | pushq %rdi /* pt_regs->di */ | |
76 | pushq %rsi /* pt_regs->si */ | |
77 | pushq %rdx /* pt_regs->dx */ | |
30bfa7b3 | 78 | pushq %rcx /* pt_regs->cx */ |
131484c8 | 79 | pushq $-ENOSYS /* pt_regs->ax */ |
a474e67c AL |
80 | pushq %r8 /* pt_regs->r8 = 0 */ |
81 | pushq %r8 /* pt_regs->r9 = 0 */ | |
82 | pushq %r8 /* pt_regs->r10 = 0 */ | |
83 | pushq %r8 /* pt_regs->r11 = 0 */ | |
84 | pushq %rbx /* pt_regs->rbx */ | |
30bfa7b3 | 85 | pushq %rbp /* pt_regs->rbp (will be overwritten) */ |
a474e67c AL |
86 | pushq %r8 /* pt_regs->r12 = 0 */ |
87 | pushq %r8 /* pt_regs->r13 = 0 */ | |
88 | pushq %r8 /* pt_regs->r14 = 0 */ | |
89 | pushq %r8 /* pt_regs->r15 = 0 */ | |
1da177e4 | 90 | cld /* SYSENTER doesn't filter flags, so DF must be cleared by hand too */ |
4c9c0e91 | 91 | |
8c7aa698 AL |
92 | /* |
93 | * Sysenter doesn't filter flags, so we need to clear NT | |
94 | * ourselves. To save a few cycles, we can check whether | |
95 | * NT was set instead of doing an unconditional popfq. | |
b611acf4 AL |
96 | * This needs to happen before enabling interrupts so that |
97 | * we don't get preempted with NT set. | |
374a3a39 | 98 | * |
f74acf0e | 99 | * NB.: .Lsysenter_fix_flags is a label with the code under it moved |
374a3a39 BP |
100 | * out-of-line as an optimization: NT is unlikely to be set in the |
101 | * majority of the cases and instead of polluting the I$ unnecessarily, | |
102 | * we're keeping that code behind a branch which will predict as | |
103 | * not-taken and therefore its instructions won't be fetched. | |
8c7aa698 | 104 | */ |
54ad726c | 105 | testl $X86_EFLAGS_NT, EFLAGS(%rsp) |
f74acf0e BP |
106 | jnz .Lsysenter_fix_flags |
107 | .Lsysenter_flags_fixed: | |
8c7aa698 | 108 | |
a474e67c AL |
109 | /* |
110 | * User mode is traced as though IRQs are on, and SYSENTER | |
111 | * turned them off. | |
112 | */ | |
113 | TRACE_IRQS_OFF | |
e62a254a | 114 | |
a474e67c AL |
115 | movq %rsp, %rdi /* arg1 = the pt_regs frame built above */ |
116 | call do_fast_syscall_32 | |
91e2eea9 BO |
117 | /* XEN PV guests always use IRET path; otherwise a zero %eax from do_fast_syscall_32 also takes the IRET path and a nonzero %eax falls through to the SYSRET exit */ |
118 | ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ | |
119 | "jmp .Lsyscall_32_done", X86_FEATURE_XENPV | |
7841b408 | 120 | jmp sysret32_from_system_call |
1da177e4 | 121 | |
f74acf0e | 122 | .Lsysenter_fix_flags: |
b611acf4 | 123 | pushq $X86_EFLAGS_FIXED |
131484c8 | 124 | popfq /* the NT-clearing popfq described above: RFLAGS is reloaded from the constant just pushed */ |
f74acf0e | 125 | jmp .Lsysenter_flags_fixed |
4c8cd0c5 | 126 | ENDPROC(entry_SYSENTER_compat) |
1da177e4 LT |
127 | |
128 | /* | |
54ad726c | 129 | * 32-bit SYSCALL instruction entry. |
1da177e4 | 130 | * |
54ad726c | 131 | * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, |
b87cf63e DV |
132 | * then loads new ss, cs, and rip from previously programmed MSRs. |
133 | * rflags gets masked by a value from another MSR (so CLD and CLAC | |
134 | * are not needed). SYSCALL does not save anything on the stack | |
135 | * and does not change rsp. | |
136 | * | |
137 | * Note: rflags saving+masking-with-MSR happens only in Long mode | |
54ad726c | 138 | * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). |
b87cf63e DV |
139 | * Don't get confused: rflags saving+masking depends on Long Mode Active bit |
140 | * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes | |
141 | * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). | |
142 | * | |
1da177e4 | 143 | * Arguments: |
b87cf63e DV |
144 | * eax system call number |
145 | * ecx return address | |
146 | * ebx arg1 | |
147 | * ebp arg2 (note: not saved in the stack frame, should not be touched) | |
148 | * edx arg3 | |
149 | * esi arg4 | |
150 | * edi arg5 | |
151 | * esp user stack | |
152 | * 0(%esp) arg6 | |
b87cf63e | 153 | */ |
2cd23553 | 154 | ENTRY(entry_SYSCALL_compat) |
a474e67c | 155 | /* Interrupts are off on entry. */ |
457da70e | 156 | SWAPGS_UNSAFE_STACK |
e62a254a | 157 | |
a474e67c | 158 | /* Stash user ESP and switch to the kernel stack. */ |
54ad726c IM |
159 | movl %esp, %r8d |
160 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp | |
a232e3d5 | 161 | |
4ee8ec17 | 162 | /* Zero-extending 32-bit regs, do not remove */ |
54ad726c | 163 | movl %eax, %eax |
4ee8ec17 | 164 | |
4c9c0e91 | 165 | /* Construct struct pt_regs on stack */ |
131484c8 IM |
166 | pushq $__USER32_DS /* pt_regs->ss */ |
167 | pushq %r8 /* pt_regs->sp */ | |
168 | pushq %r11 /* pt_regs->flags */ | |
169 | pushq $__USER32_CS /* pt_regs->cs */ | |
170 | pushq %rcx /* pt_regs->ip */ | |
171 | pushq %rax /* pt_regs->orig_ax */ | |
172 | pushq %rdi /* pt_regs->di */ | |
173 | pushq %rsi /* pt_regs->si */ | |
174 | pushq %rdx /* pt_regs->dx */ | |
30bfa7b3 | 175 | pushq %rbp /* pt_regs->cx (stashed in bp) */ |
131484c8 | 176 | pushq $-ENOSYS /* pt_regs->ax */ |
a474e67c AL |
177 | xorq %r8,%r8 |
178 | pushq %r8 /* pt_regs->r8 = 0 */ | |
179 | pushq %r8 /* pt_regs->r9 = 0 */ | |
180 | pushq %r8 /* pt_regs->r10 = 0 */ | |
181 | pushq %r8 /* pt_regs->r11 = 0 */ | |
182 | pushq %rbx /* pt_regs->rbx */ | |
30bfa7b3 | 183 | pushq %rbp /* pt_regs->rbp (will be overwritten) */ |
a474e67c AL |
184 | pushq %r8 /* pt_regs->r12 = 0 */ |
185 | pushq %r8 /* pt_regs->r13 = 0 */ | |
186 | pushq %r8 /* pt_regs->r14 = 0 */ | |
187 | pushq %r8 /* pt_regs->r15 = 0 */ | |
4c9c0e91 | 188 | |
a474e67c AL |
189 | /* |
190 | * User mode is traced as though IRQs are on, and SYSCALL | |
191 | * turned them off. | |
192 | */ | |
193 | TRACE_IRQS_OFF | |
194 | ||
195 | movq %rsp, %rdi /* arg1 = the pt_regs frame built above */ | |
196 | call do_fast_syscall_32 | |
91e2eea9 BO |
197 | /* XEN PV guests always use IRET path; otherwise a zero %eax from do_fast_syscall_32 also takes the IRET path and a nonzero %eax falls through to the SYSRET exit */ |
198 | ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ | |
199 | "jmp .Lsyscall_32_done", X86_FEATURE_XENPV | |
7841b408 AL |
200 | |
201 | /* Opportunistic SYSRET */ | |
202 | sysret32_from_system_call: | |
203 | TRACE_IRQS_ON /* User mode traces as IRQs on. */ | |
204 | movq RBX(%rsp), %rbx /* pt_regs->rbx */ | |
205 | movq RBP(%rsp), %rbp /* pt_regs->rbp */ | |
206 | movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ | |
207 | movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */ | |
208 | addq $RAX, %rsp /* Skip r8-r15 */ | |
209 | popq %rax /* pt_regs->rax */ | |
210 | popq %rdx /* Skip pt_regs->cx */ | |
211 | popq %rdx /* pt_regs->dx */ | |
212 | popq %rsi /* pt_regs->si */ | |
213 | popq %rdi /* pt_regs->di */ | |
214 | ||
215 | /* | |
216 | * The swapgs + sysretl sequence below does: | |
217 | * GSBASE = user's GS base | |
218 | * EIP = ECX | |
219 | * RFLAGS = R11 | |
220 | * CS = __USER32_CS | |
221 | * SS = __USER_DS | |
222 | * | |
223 | * ECX will not match pt_regs->cx, but we're returning to a vDSO | |
224 | * trampoline that will fix up RCX, so this is okay. | |
225 | * | |
226 | * R12-R15 are callee-saved, so they contain whatever was in them | |
227 | * when the system call started, which is already known to user | |
228 | * code. We zero R8-R10 to avoid info leaks. | |
229 | */ | |
230 | xorq %r8, %r8 | |
231 | xorq %r9, %r9 | |
232 | xorq %r10, %r10 | |
233 | movq RSP-ORIG_RAX(%rsp), %rsp /* rsp now points at pt_regs->orig_ax; reload the saved user stack pointer (pt_regs->sp) */ | |
75ef8219 BO |
234 | swapgs |
235 | sysretl | |
2cd23553 | 236 | END(entry_SYSCALL_compat) |
54ad726c | 237 | |
b87cf63e DV |
238 | /* |
239 | * Emulated IA32 system calls via int 0x80. | |
1da177e4 | 240 | * |
b87cf63e DV |
241 | * Arguments: |
242 | * eax system call number | |
243 | * ebx arg1 | |
244 | * ecx arg2 | |
245 | * edx arg3 | |
246 | * esi arg4 | |
247 | * edi arg5 | |
248 | * ebp arg6 (note: not saved in the stack frame, should not be touched) | |
1da177e4 LT |
249 | * |
250 | * Notes: | |
b87cf63e DV |
251 | * Uses the same stack frame as the x86-64 version. |
252 | * All registers except eax must be saved (but ptrace may violate that). | |
1da177e4 LT |
253 | * Arguments are zero extended. For system calls that want sign extension and |
254 | * take long arguments a wrapper is needed. Most calls can just be called | |
255 | * directly. | |
b87cf63e DV |
256 | * Assumes it is only called from user space and entered with interrupts off. |
257 | */ | |
1da177e4 | 258 | |
2cd23553 | 259 | ENTRY(entry_INT80_compat) |
2601e64d | 260 | /* |
a232e3d5 | 261 | * Interrupts are off on entry. |
2601e64d | 262 | */ |
a232e3d5 | 263 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
3d44d51b | 264 | ASM_CLAC /* Do this early to minimize exposure */ |
a232e3d5 | 265 | SWAPGS |
a232e3d5 | 266 | |
ee08c6bd AL |
267 | /* |
268 | * User tracing code (ptrace or signal handlers) might assume that | |
269 | * the saved RAX contains a 32-bit number when we're invoking a 32-bit | |
270 | * syscall. Just in case the high bits are nonzero, zero-extend | |
271 | * the syscall number. (This could almost certainly be deleted | |
272 | * with no ill effects.) | |
273 | */ | |
54ad726c | 274 | movl %eax, %eax |
4ee8ec17 | 275 | |
4c9c0e91 | 276 | /* Construct struct pt_regs on stack (iret frame is already on stack) */ |
131484c8 IM |
277 | pushq %rax /* pt_regs->orig_ax */ |
278 | pushq %rdi /* pt_regs->di */ | |
279 | pushq %rsi /* pt_regs->si */ | |
280 | pushq %rdx /* pt_regs->dx */ | |
281 | pushq %rcx /* pt_regs->cx */ | |
282 | pushq $-ENOSYS /* pt_regs->ax */ | |
8169aff6 AL |
283 | xorq %r8,%r8 |
284 | pushq %r8 /* pt_regs->r8 = 0 */ | |
285 | pushq %r8 /* pt_regs->r9 = 0 */ | |
286 | pushq %r8 /* pt_regs->r10 = 0 */ | |
287 | pushq %r8 /* pt_regs->r11 = 0 */ | |
288 | pushq %rbx /* pt_regs->rbx */ | |
289 | pushq %rbp /* pt_regs->rbp */ | |
290 | pushq %r12 /* pt_regs->r12 */ | |
291 | pushq %r13 /* pt_regs->r13 */ | |
292 | pushq %r14 /* pt_regs->r14 */ | |
293 | pushq %r15 /* pt_regs->r15 */ | |
1da177e4 | 294 | cld |
54ad726c | 295 | |
73cbf687 | 296 | /* |
ee08c6bd AL |
297 | * User mode is traced as though IRQs are on, and the interrupt |
298 | * gate turned them off. | |
73cbf687 | 299 | */ |
ee08c6bd AL |
300 | TRACE_IRQS_OFF |
301 | ||
302 | movq %rsp, %rdi /* arg1 = the pt_regs frame built above */ | |
657c1eea | 303 | call do_syscall_32_irqs_off |
a474e67c | 304 | .Lsyscall_32_done: /* also the IRET-path target of the SYSENTER and SYSCALL entries */ |
ee08c6bd AL |
305 | |
306 | /* Go back to user mode. */ | |
307 | TRACE_IRQS_ON | |
308 | SWAPGS | |
309 | jmp restore_regs_and_iret | |
2cd23553 | 310 | END(entry_INT80_compat) |
1da177e4 | 311 | |
1d4b4b29 AV |
312 | ALIGN |
313 | GLOBAL(stub32_clone) | |
5cdc683b | 314 | /* |
7a5a9824 DV |
315 | * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). |
316 | * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). | |
317 | * | |
318 | * The native 64-bit kernel's sys_clone() implements the latter, | |
319 | * so we need to swap arguments here before calling it: | |
5cdc683b | 320 | */ |
7a5a9824 | 321 | xchg %r8, %rcx /* exchange the 4th (%rcx) and 5th (%r8) C argument registers */ |
8169aff6 | 322 | jmp sys_clone /* tail jump: sys_clone returns straight to our caller */ |