]>
Commit | Line | Data |
---|---|---|
59d5af67 | 1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
321d628a FG |
2 | From: Andy Lutomirski <luto@kernel.org> |
3 | Date: Mon, 4 Dec 2017 15:07:25 +0100 | |
59d5af67 | 4 | Subject: [PATCH] x86/entry/64: Create a per-CPU SYSCALL entry trampoline |
321d628a FG |
5 | MIME-Version: 1.0 |
6 | Content-Type: text/plain; charset=UTF-8 | |
7 | Content-Transfer-Encoding: 8bit | |
8 | ||
9 | CVE-2017-5754 | |
10 | ||
11 | Handling SYSCALL is tricky: the SYSCALL handler is entered with every | |
12 | single register (except FLAGS), including RSP, live. It somehow needs | |
13 | to set RSP to point to a valid stack, which means it needs to save the | |
14 | user RSP somewhere and find its own stack pointer. The canonical way | |
15 | to do this is with SWAPGS, which lets us access percpu data using the | |
16 | %gs prefix. | |
17 | ||
18 | With PAGE_TABLE_ISOLATION-like pagetable switching, this is | |
19 | problematic. Without a scratch register, switching CR3 is impossible, so | |
20 | %gs-based percpu memory would need to be mapped in the user pagetables. | |
21 | Doing that without information leaks is difficult or impossible. | |
22 | ||
23 | Instead, use a different sneaky trick. Map a copy of the first part | |
24 | of the SYSCALL asm at a different address for each CPU. Now RIP | |
25 | varies depending on the CPU, so we can use RIP-relative memory access | |
26 | to access percpu memory. By putting the relevant information (one | |
27 | scratch slot and the stack address) at a constant offset relative to | |
28 | RIP, we can make SYSCALL work without relying on %gs. | |
29 | ||
30 | A nice thing about this approach is that we can easily switch it on | |
31 | and off if we want pagetable switching to be configurable. | |
32 | ||
33 | The compat variant of SYSCALL doesn't have this problem in the first | |
34 | place -- there are plenty of scratch registers, since we don't care | |
35 | about preserving r8-r15. This patch therefore doesn't touch SYSCALL32 | |
36 | at all. | |
37 | ||
38 | This patch actually seems to be a small speedup. With this patch, | |
39 | SYSCALL touches an extra cache line and an extra virtual page, but | |
40 | the pipeline no longer stalls waiting for SWAPGS. It seems that, at | |
41 | least in a tight loop, the latter outweighs the former. | |
42 | ||
43 | Thanks to David Laight for an optimization tip. | |
44 | ||
45 | Signed-off-by: Andy Lutomirski <luto@kernel.org> | |
46 | Signed-off-by: Thomas Gleixner <tglx@linutronix.de> | |
47 | Reviewed-by: Thomas Gleixner <tglx@linutronix.de> | |
48 | Reviewed-by: Borislav Petkov <bpetkov@suse.de> | |
49 | Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> | |
50 | Cc: Borislav Petkov <bp@alien8.de> | |
51 | Cc: Brian Gerst <brgerst@gmail.com> | |
52 | Cc: Dave Hansen <dave.hansen@intel.com> | |
53 | Cc: Dave Hansen <dave.hansen@linux.intel.com> | |
54 | Cc: David Laight <David.Laight@aculab.com> | |
55 | Cc: Denys Vlasenko <dvlasenk@redhat.com> | |
56 | Cc: Eduardo Valentin <eduval@amazon.com> | |
57 | Cc: Greg KH <gregkh@linuxfoundation.org> | |
58 | Cc: H. Peter Anvin <hpa@zytor.com> | |
59 | Cc: Josh Poimboeuf <jpoimboe@redhat.com> | |
60 | Cc: Juergen Gross <jgross@suse.com> | |
61 | Cc: Linus Torvalds <torvalds@linux-foundation.org> | |
62 | Cc: Peter Zijlstra <peterz@infradead.org> | |
63 | Cc: Rik van Riel <riel@redhat.com> | |
64 | Cc: Will Deacon <will.deacon@arm.com> | |
65 | Cc: aliguori@amazon.com | |
66 | Cc: daniel.gruss@iaik.tugraz.at | |
67 | Cc: hughd@google.com | |
68 | Cc: keescook@google.com | |
69 | Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de | |
70 | Signed-off-by: Ingo Molnar <mingo@kernel.org> | |
71 | (cherry picked from commit 3386bc8aed825e9f1f65ce38df4b109b2019b71a) | |
72 | Signed-off-by: Andy Whitcroft <apw@canonical.com> | |
73 | Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com> | |
74 | (cherry picked from commit 9fec5954d068a19bbf134da7af66db94699b03a3) | |
75 | Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> | |
76 | --- | |
77 | arch/x86/include/asm/fixmap.h | 2 ++ | |
78 | arch/x86/kernel/asm-offsets.c | 1 + | |
79 | arch/x86/kernel/cpu/common.c | 15 ++++++++++- | |
80 | arch/x86/entry/entry_64.S | 58 +++++++++++++++++++++++++++++++++++++++++++ | |
81 | arch/x86/kernel/vmlinux.lds.S | 9 +++++++ | |
82 | 5 files changed, 84 insertions(+), 1 deletion(-) | |
83 | ||
84 | diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h | |
85 | index c92fc30e6def..189d12d8afe0 100644 | |
86 | --- a/arch/x86/include/asm/fixmap.h | |
87 | +++ b/arch/x86/include/asm/fixmap.h | |
88 | @@ -61,6 +61,8 @@ struct cpu_entry_area { | |
89 | * of the TSS region. | |
90 | */ | |
91 | struct tss_struct tss; | |
92 | + | |
93 | + char entry_trampoline[PAGE_SIZE]; | |
94 | }; | |
95 | ||
96 | #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) | |
97 | diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c | |
98 | index f765c3253ec3..822be00c85ff 100644 | |
99 | --- a/arch/x86/kernel/asm-offsets.c | |
100 | +++ b/arch/x86/kernel/asm-offsets.c | |
101 | @@ -100,4 +100,5 @@ void common(void) { | |
102 | ||
103 | /* Layout info for cpu_entry_area */ | |
104 | OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); | |
105 | + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); | |
106 | } | |
107 | diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c | |
108 | index 404e4b75db6e..c2b2ee73b8a1 100644 | |
109 | --- a/arch/x86/kernel/cpu/common.c | |
110 | +++ b/arch/x86/kernel/cpu/common.c | |
111 | @@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); | |
112 | static inline void setup_cpu_entry_area(int cpu) | |
113 | { | |
114 | #ifdef CONFIG_X86_64 | |
115 | + extern char _entry_trampoline[]; | |
116 | + | |
117 | /* On 64-bit systems, we use a read-only fixmap GDT. */ | |
118 | pgprot_t gdt_prot = PAGE_KERNEL_RO; | |
119 | #else | |
120 | @@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area(int cpu) | |
121 | #ifdef CONFIG_X86_32 | |
122 | this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); | |
123 | #endif | |
124 | + | |
125 | +#ifdef CONFIG_X86_64 | |
126 | + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), | |
127 | + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); | |
128 | +#endif | |
129 | } | |
130 | ||
131 | /* Load the original GDT from the per-cpu structure */ | |
132 | @@ -1396,10 +1403,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks | |
133 | /* May not be marked __init: used by software suspend */ | |
134 | void syscall_init(void) | |
135 | { | |
136 | + extern char _entry_trampoline[]; | |
137 | + extern char entry_SYSCALL_64_trampoline[]; | |
138 | + | |
139 | int cpu = smp_processor_id(); | |
140 | + unsigned long SYSCALL64_entry_trampoline = | |
141 | + (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + | |
142 | + (entry_SYSCALL_64_trampoline - _entry_trampoline); | |
143 | ||
144 | wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); | |
145 | - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); | |
146 | + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); | |
147 | ||
148 | #ifdef CONFIG_IA32_EMULATION | |
149 | wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); | |
150 | diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S | |
151 | index 4abe5b806d2a..dc100a7052ee 100644 | |
152 | --- a/arch/x86/entry/entry_64.S | |
153 | +++ b/arch/x86/entry/entry_64.S | |
154 | @@ -135,6 +135,64 @@ END(native_usergs_sysret64) | |
155 | * with them due to bugs in both AMD and Intel CPUs. | |
156 | */ | |
157 | ||
158 | + .pushsection .entry_trampoline, "ax" | |
159 | + | |
160 | +/* | |
161 | + * The code in here gets remapped into cpu_entry_area's trampoline. This means | |
162 | + * that the assembler and linker have the wrong idea as to where this code | |
163 | + * lives (and, in fact, it's mapped more than once, so it's not even at a | |
164 | + * fixed address). So we can't reference any symbols outside the entry | |
165 | + * trampoline and expect it to work. | |
166 | + * | |
167 | + * Instead, we carefully abuse %rip-relative addressing. | |
168 | + * _entry_trampoline(%rip) refers to the start of the (remapped) entry | |
169 | + * trampoline. We can thus find cpu_entry_area with this macro: | |
170 | + */ | |
171 | + | |
172 | +#define CPU_ENTRY_AREA \ | |
173 | + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) | |
174 | + | |
175 | +/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ | |
176 | +#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ | |
177 | + SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA | |
178 | + | |
179 | +ENTRY(entry_SYSCALL_64_trampoline) | |
180 | + UNWIND_HINT_EMPTY | |
181 | + swapgs | |
182 | + | |
183 | + /* Stash the user RSP. */ | |
184 | + movq %rsp, RSP_SCRATCH | |
185 | + | |
186 | + /* Load the top of the task stack into RSP */ | |
187 | + movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp | |
188 | + | |
189 | + /* Start building the simulated IRET frame. */ | |
190 | + pushq $__USER_DS /* pt_regs->ss */ | |
191 | + pushq RSP_SCRATCH /* pt_regs->sp */ | |
192 | + pushq %r11 /* pt_regs->flags */ | |
193 | + pushq $__USER_CS /* pt_regs->cs */ | |
194 | + pushq %rcx /* pt_regs->ip */ | |
195 | + | |
196 | + /* | |
197 | + * x86 lacks a near absolute jump, and we can't jump to the real | |
198 | + * entry text with a relative jump. We could push the target | |
199 | + * address and then use retq, but this destroys the pipeline on | |
200 | + * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, | |
201 | + * spill RDI and restore it in a second-stage trampoline. | |
202 | + */ | |
203 | + pushq %rdi | |
204 | + movq $entry_SYSCALL_64_stage2, %rdi | |
205 | + jmp *%rdi | |
206 | +END(entry_SYSCALL_64_trampoline) | |
207 | + | |
208 | + .popsection | |
209 | + | |
210 | +ENTRY(entry_SYSCALL_64_stage2) | |
211 | + UNWIND_HINT_EMPTY | |
212 | + popq %rdi | |
213 | + jmp entry_SYSCALL_64_after_hwframe | |
214 | +END(entry_SYSCALL_64_stage2) | |
215 | + | |
216 | ENTRY(entry_SYSCALL_64) | |
217 | UNWIND_HINT_EMPTY | |
218 | /* | |
219 | diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S | |
220 | index f05f00acac89..423aa36f0150 100644 | |
221 | --- a/arch/x86/kernel/vmlinux.lds.S | |
222 | +++ b/arch/x86/kernel/vmlinux.lds.S | |
223 | @@ -106,6 +106,15 @@ SECTIONS | |
224 | SOFTIRQENTRY_TEXT | |
225 | *(.fixup) | |
226 | *(.gnu.warning) | |
227 | + | |
228 | +#ifdef CONFIG_X86_64 | |
229 | + . = ALIGN(PAGE_SIZE); | |
230 | + _entry_trampoline = .; | |
231 | + *(.entry_trampoline) | |
232 | + . = ALIGN(PAGE_SIZE); | |
233 | + ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); | |
234 | +#endif | |
235 | + | |
236 | /* End of text section */ | |
237 | _etext = .; | |
238 | } :text = 0x9090 | |
239 | -- | |
240 | 2.14.2 | |
241 |