]>
Commit | Line | Data |
---|---|---|
321d628a FG |
1 | From 2ae2b7902084742e84eac3e32409f0d9ff4811d8 Mon Sep 17 00:00:00 2001 |
2 | From: Andy Lutomirski <luto@kernel.org> | |
3 | Date: Mon, 4 Dec 2017 15:07:25 +0100 | |
e4cdf2a5 | 4 | Subject: [PATCH 155/241] x86/entry/64: Create a per-CPU SYSCALL entry |
321d628a FG |
5 | trampoline |
6 | MIME-Version: 1.0 | |
7 | Content-Type: text/plain; charset=UTF-8 | |
8 | Content-Transfer-Encoding: 8bit | |
9 | ||
10 | CVE-2017-5754 | |
11 | ||
12 | Handling SYSCALL is tricky: the SYSCALL handler is entered with every | |
13 | single register (except FLAGS), including RSP, live. It somehow needs | |
14 | to set RSP to point to a valid stack, which means it needs to save the | |
15 | user RSP somewhere and find its own stack pointer. The canonical way | |
16 | to do this is with SWAPGS, which lets us access percpu data using the | |
17 | %gs prefix. | |
18 | ||
19 | With PAGE_TABLE_ISOLATION-like pagetable switching, this is | |
20 | problematic. Without a scratch register, switching CR3 is impossible, so | |
21 | %gs-based percpu memory would need to be mapped in the user pagetables. | |
22 | Doing that without information leaks is difficult or impossible. | |
23 | ||
24 | Instead, use a different sneaky trick. Map a copy of the first part | |
25 | of the SYSCALL asm at a different address for each CPU. Now RIP | |
26 | varies depending on the CPU, so we can use RIP-relative memory access | |
27 | to access percpu memory. By putting the relevant information (one | |
28 | scratch slot and the stack address) at a constant offset relative to | |
29 | RIP, we can make SYSCALL work without relying on %gs. | |
30 | ||
31 | A nice thing about this approach is that we can easily switch it on | |
32 | and off if we want pagetable switching to be configurable. | |
33 | ||
34 | The compat variant of SYSCALL doesn't have this problem in the first | |
35 | place -- there are plenty of scratch registers, since we don't care | |
36 | about preserving r8-r15. This patch therefore doesn't touch SYSCALL32 | |
37 | at all. | |
38 | ||
39 | This patch actually seems to be a small speedup. With this patch, | |
40 | SYSCALL touches an extra cache line and an extra virtual page, but | |
41 | the pipeline no longer stalls waiting for SWAPGS. It seems that, at | |
42 | least in a tight loop, the latter outweighs the former. | |
43 | ||
44 | Thanks to David Laight for an optimization tip. | |
45 | ||
46 | Signed-off-by: Andy Lutomirski <luto@kernel.org> | |
47 | Signed-off-by: Thomas Gleixner <tglx@linutronix.de> | |
48 | Reviewed-by: Thomas Gleixner <tglx@linutronix.de> | |
49 | Reviewed-by: Borislav Petkov <bpetkov@suse.de> | |
50 | Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> | |
51 | Cc: Borislav Petkov <bp@alien8.de> | |
52 | Cc: Brian Gerst <brgerst@gmail.com> | |
53 | Cc: Dave Hansen <dave.hansen@intel.com> | |
54 | Cc: Dave Hansen <dave.hansen@linux.intel.com> | |
55 | Cc: David Laight <David.Laight@aculab.com> | |
56 | Cc: Denys Vlasenko <dvlasenk@redhat.com> | |
57 | Cc: Eduardo Valentin <eduval@amazon.com> | |
58 | Cc: Greg KH <gregkh@linuxfoundation.org> | |
59 | Cc: H. Peter Anvin <hpa@zytor.com> | |
60 | Cc: Josh Poimboeuf <jpoimboe@redhat.com> | |
61 | Cc: Juergen Gross <jgross@suse.com> | |
62 | Cc: Linus Torvalds <torvalds@linux-foundation.org> | |
63 | Cc: Peter Zijlstra <peterz@infradead.org> | |
64 | Cc: Rik van Riel <riel@redhat.com> | |
65 | Cc: Will Deacon <will.deacon@arm.com> | |
66 | Cc: aliguori@amazon.com | |
67 | Cc: daniel.gruss@iaik.tugraz.at | |
68 | Cc: hughd@google.com | |
69 | Cc: keescook@google.com | |
70 | Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de | |
71 | Signed-off-by: Ingo Molnar <mingo@kernel.org> | |
72 | (cherry picked from commit 3386bc8aed825e9f1f65ce38df4b109b2019b71a) | |
73 | Signed-off-by: Andy Whitcroft <apw@canonical.com> | |
74 | Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com> | |
75 | (cherry picked from commit 9fec5954d068a19bbf134da7af66db94699b03a3) | |
76 | Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> | |
77 | --- | |
78 | arch/x86/include/asm/fixmap.h | 2 ++ | |
79 | arch/x86/kernel/asm-offsets.c | 1 + | |
80 | arch/x86/kernel/cpu/common.c | 15 ++++++++++- | |
81 | arch/x86/entry/entry_64.S | 58 +++++++++++++++++++++++++++++++++++++++++++ | |
82 | arch/x86/kernel/vmlinux.lds.S | 9 +++++++ | |
83 | 5 files changed, 84 insertions(+), 1 deletion(-) | |
84 | ||
85 | diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h | |
86 | index c92fc30e6def..189d12d8afe0 100644 | |
87 | --- a/arch/x86/include/asm/fixmap.h | |
88 | +++ b/arch/x86/include/asm/fixmap.h | |
89 | @@ -61,6 +61,8 @@ struct cpu_entry_area { | |
90 | * of the TSS region. | |
91 | */ | |
92 | struct tss_struct tss; | |
93 | + | |
94 | + char entry_trampoline[PAGE_SIZE]; | |
95 | }; | |
96 | ||
97 | #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) | |
98 | diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c | |
99 | index f765c3253ec3..822be00c85ff 100644 | |
100 | --- a/arch/x86/kernel/asm-offsets.c | |
101 | +++ b/arch/x86/kernel/asm-offsets.c | |
102 | @@ -100,4 +100,5 @@ void common(void) { | |
103 | ||
104 | /* Layout info for cpu_entry_area */ | |
105 | OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); | |
106 | + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); | |
107 | } | |
108 | diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c | |
109 | index 404e4b75db6e..c2b2ee73b8a1 100644 | |
110 | --- a/arch/x86/kernel/cpu/common.c | |
111 | +++ b/arch/x86/kernel/cpu/common.c | |
112 | @@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); | |
113 | static inline void setup_cpu_entry_area(int cpu) | |
114 | { | |
115 | #ifdef CONFIG_X86_64 | |
116 | + extern char _entry_trampoline[]; | |
117 | + | |
118 | /* On 64-bit systems, we use a read-only fixmap GDT. */ | |
119 | pgprot_t gdt_prot = PAGE_KERNEL_RO; | |
120 | #else | |
121 | @@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area(int cpu) | |
122 | #ifdef CONFIG_X86_32 | |
123 | this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); | |
124 | #endif | |
125 | + | |
126 | +#ifdef CONFIG_X86_64 | |
127 | + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), | |
128 | + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); | |
129 | +#endif | |
130 | } | |
131 | ||
132 | /* Load the original GDT from the per-cpu structure */ | |
133 | @@ -1396,10 +1403,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks | |
134 | /* May not be marked __init: used by software suspend */ | |
135 | void syscall_init(void) | |
136 | { | |
137 | + extern char _entry_trampoline[]; | |
138 | + extern char entry_SYSCALL_64_trampoline[]; | |
139 | + | |
140 | int cpu = smp_processor_id(); | |
141 | + unsigned long SYSCALL64_entry_trampoline = | |
142 | + (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + | |
143 | + (entry_SYSCALL_64_trampoline - _entry_trampoline); | |
144 | ||
145 | wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); | |
146 | - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); | |
147 | + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); | |
148 | ||
149 | #ifdef CONFIG_IA32_EMULATION | |
150 | wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); | |
151 | diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S | |
152 | index 4abe5b806d2a..dc100a7052ee 100644 | |
153 | --- a/arch/x86/entry/entry_64.S | |
154 | +++ b/arch/x86/entry/entry_64.S | |
155 | @@ -135,6 +135,64 @@ END(native_usergs_sysret64) | |
156 | * with them due to bugs in both AMD and Intel CPUs. | |
157 | */ | |
158 | ||
159 | + .pushsection .entry_trampoline, "ax" | |
160 | + | |
161 | +/* | |
162 | + * The code in here gets remapped into cpu_entry_area's trampoline. This means | |
163 | + * that the assembler and linker have the wrong idea as to where this code | |
164 | + * lives (and, in fact, it's mapped more than once, so it's not even at a | |
165 | + * fixed address). So we can't reference any symbols outside the entry | |
166 | + * trampoline and expect it to work. | |
167 | + * | |
168 | + * Instead, we carefully abuse %rip-relative addressing. | |
169 | + * _entry_trampoline(%rip) refers to the start of the remapped entry | |
170 | + * trampoline. We can thus find cpu_entry_area with this macro: | |
171 | + */ | |
172 | + | |
173 | +#define CPU_ENTRY_AREA \ | |
174 | + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) | |
175 | + | |
176 | +/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ | |
177 | +#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ | |
178 | + SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA | |
179 | + | |
180 | +ENTRY(entry_SYSCALL_64_trampoline) | |
181 | + UNWIND_HINT_EMPTY | |
182 | + swapgs | |
183 | + | |
184 | + /* Stash the user RSP. */ | |
185 | + movq %rsp, RSP_SCRATCH | |
186 | + | |
187 | + /* Load the top of the task stack into RSP */ | |
188 | + movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp | |
189 | + | |
190 | + /* Start building the simulated IRET frame. */ | |
191 | + pushq $__USER_DS /* pt_regs->ss */ | |
192 | + pushq RSP_SCRATCH /* pt_regs->sp */ | |
193 | + pushq %r11 /* pt_regs->flags */ | |
194 | + pushq $__USER_CS /* pt_regs->cs */ | |
195 | + pushq %rcx /* pt_regs->ip */ | |
196 | + | |
197 | + /* | |
198 | + * x86 lacks a near absolute jump, and we can't jump to the real | |
199 | + * entry text with a relative jump. We could push the target | |
200 | + * address and then use retq, but this destroys the pipeline on | |
201 | + * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, | |
202 | + * spill RDI and restore it in a second-stage trampoline. | |
203 | + */ | |
204 | + pushq %rdi | |
205 | + movq $entry_SYSCALL_64_stage2, %rdi | |
206 | + jmp *%rdi | |
207 | +END(entry_SYSCALL_64_trampoline) | |
208 | + | |
209 | + .popsection | |
210 | + | |
211 | +ENTRY(entry_SYSCALL_64_stage2) | |
212 | + UNWIND_HINT_EMPTY | |
213 | + popq %rdi | |
214 | + jmp entry_SYSCALL_64_after_hwframe | |
215 | +END(entry_SYSCALL_64_stage2) | |
216 | + | |
217 | ENTRY(entry_SYSCALL_64) | |
218 | UNWIND_HINT_EMPTY | |
219 | /* | |
220 | diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S | |
221 | index f05f00acac89..423aa36f0150 100644 | |
222 | --- a/arch/x86/kernel/vmlinux.lds.S | |
223 | +++ b/arch/x86/kernel/vmlinux.lds.S | |
224 | @@ -106,6 +106,15 @@ SECTIONS | |
225 | SOFTIRQENTRY_TEXT | |
226 | *(.fixup) | |
227 | *(.gnu.warning) | |
228 | + | |
229 | +#ifdef CONFIG_X86_64 | |
230 | + . = ALIGN(PAGE_SIZE); | |
231 | + _entry_trampoline = .; | |
232 | + *(.entry_trampoline) | |
233 | + . = ALIGN(PAGE_SIZE); | |
234 | + ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); | |
235 | +#endif | |
236 | + | |
237 | /* End of text section */ | |
238 | _etext = .; | |
239 | } :text = 0x9090 | |
240 | -- | |
241 | 2.14.2 | |
242 |