]> git.proxmox.com Git - pve-kernel.git/blob - patches/kernel/0157-x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch
4319f10281474755d480c1768ad890e512da1e70
[pve-kernel.git] / patches / kernel / 0157-x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Andy Lutomirski <luto@kernel.org>
3 Date: Mon, 4 Dec 2017 15:07:25 +0100
4 Subject: [PATCH] x86/entry/64: Create a per-CPU SYSCALL entry trampoline
5 MIME-Version: 1.0
6 Content-Type: text/plain; charset=UTF-8
7 Content-Transfer-Encoding: 8bit
8
9 CVE-2017-5754
10
11 Handling SYSCALL is tricky: the SYSCALL handler is entered with every
12 single register (except FLAGS), including RSP, live. It somehow needs
13 to set RSP to point to a valid stack, which means it needs to save the
14 user RSP somewhere and find its own stack pointer. The canonical way
15 to do this is with SWAPGS, which lets us access percpu data using the
16 %gs prefix.
17
18 With PAGE_TABLE_ISOLATION-like pagetable switching, this is
19 problematic. Without a scratch register, switching CR3 is impossible, so
20 %gs-based percpu memory would need to be mapped in the user pagetables.
21 Doing that without information leaks is difficult or impossible.
22
23 Instead, use a different sneaky trick. Map a copy of the first part
24 of the SYSCALL asm at a different address for each CPU. Now RIP
25 varies depending on the CPU, so we can use RIP-relative memory access
26 to access percpu memory. By putting the relevant information (one
27 scratch slot and the stack address) at a constant offset relative to
28 RIP, we can make SYSCALL work without relying on %gs.
29
30 A nice thing about this approach is that we can easily switch it on
31 and off if we want pagetable switching to be configurable.
32
33 The compat variant of SYSCALL doesn't have this problem in the first
34 place -- there are plenty of scratch registers, since we don't care
35 about preserving r8-r15. This patch therefore doesn't touch SYSCALL32
36 at all.
37
38 This patch actually seems to be a small speedup. With this patch,
39 SYSCALL touches an extra cache line and an extra virtual page, but
40 the pipeline no longer stalls waiting for SWAPGS. It seems that, at
41 least in a tight loop, the latter outweighs the former.
42
43 Thanks to David Laight for an optimization tip.
44
45 Signed-off-by: Andy Lutomirski <luto@kernel.org>
46 Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
47 Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
48 Reviewed-by: Borislav Petkov <bpetkov@suse.de>
49 Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
50 Cc: Borislav Petkov <bp@alien8.de>
51 Cc: Brian Gerst <brgerst@gmail.com>
52 Cc: Dave Hansen <dave.hansen@intel.com>
53 Cc: Dave Hansen <dave.hansen@linux.intel.com>
54 Cc: David Laight <David.Laight@aculab.com>
55 Cc: Denys Vlasenko <dvlasenk@redhat.com>
56 Cc: Eduardo Valentin <eduval@amazon.com>
57 Cc: Greg KH <gregkh@linuxfoundation.org>
58 Cc: H. Peter Anvin <hpa@zytor.com>
59 Cc: Josh Poimboeuf <jpoimboe@redhat.com>
60 Cc: Juergen Gross <jgross@suse.com>
61 Cc: Linus Torvalds <torvalds@linux-foundation.org>
62 Cc: Peter Zijlstra <peterz@infradead.org>
63 Cc: Rik van Riel <riel@redhat.com>
64 Cc: Will Deacon <will.deacon@arm.com>
65 Cc: aliguori@amazon.com
66 Cc: daniel.gruss@iaik.tugraz.at
67 Cc: hughd@google.com
68 Cc: keescook@google.com
69 Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de
70 Signed-off-by: Ingo Molnar <mingo@kernel.org>
71 (cherry picked from commit 3386bc8aed825e9f1f65ce38df4b109b2019b71a)
72 Signed-off-by: Andy Whitcroft <apw@canonical.com>
73 Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
74 (cherry picked from commit 9fec5954d068a19bbf134da7af66db94699b03a3)
75 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
76 ---
77 arch/x86/include/asm/fixmap.h | 2 ++
78 arch/x86/kernel/asm-offsets.c | 1 +
79 arch/x86/kernel/cpu/common.c | 15 ++++++++++-
80 arch/x86/entry/entry_64.S | 58 +++++++++++++++++++++++++++++++++++++++++++
81 arch/x86/kernel/vmlinux.lds.S | 9 +++++++
82 5 files changed, 84 insertions(+), 1 deletion(-)
83
84 diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
85 index c92fc30e6def..189d12d8afe0 100644
86 --- a/arch/x86/include/asm/fixmap.h
87 +++ b/arch/x86/include/asm/fixmap.h
88 @@ -61,6 +61,8 @@ struct cpu_entry_area {
89 * of the TSS region.
90 */
91 struct tss_struct tss;
92 +
93 + char entry_trampoline[PAGE_SIZE];
94 };
95
96 #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
97 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
98 index f765c3253ec3..822be00c85ff 100644
99 --- a/arch/x86/kernel/asm-offsets.c
100 +++ b/arch/x86/kernel/asm-offsets.c
101 @@ -100,4 +100,5 @@ void common(void) {
102
103 /* Layout info for cpu_entry_area */
104 OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
105 + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
106 }
107 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
108 index 404e4b75db6e..c2b2ee73b8a1 100644
109 --- a/arch/x86/kernel/cpu/common.c
110 +++ b/arch/x86/kernel/cpu/common.c
111 @@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
112 static inline void setup_cpu_entry_area(int cpu)
113 {
114 #ifdef CONFIG_X86_64
115 + extern char _entry_trampoline[];
116 +
117 /* On 64-bit systems, we use a read-only fixmap GDT. */
118 pgprot_t gdt_prot = PAGE_KERNEL_RO;
119 #else
120 @@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area(int cpu)
121 #ifdef CONFIG_X86_32
122 this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu));
123 #endif
124 +
125 +#ifdef CONFIG_X86_64
126 + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
127 + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
128 +#endif
129 }
130
131 /* Load the original GDT from the per-cpu structure */
132 @@ -1396,10 +1403,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
133 /* May not be marked __init: used by software suspend */
134 void syscall_init(void)
135 {
136 + extern char _entry_trampoline[];
137 + extern char entry_SYSCALL_64_trampoline[];
138 +
139 int cpu = smp_processor_id();
140 + unsigned long SYSCALL64_entry_trampoline =
141 + (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
142 + (entry_SYSCALL_64_trampoline - _entry_trampoline);
143
144 wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
145 - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
146 + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
147
148 #ifdef CONFIG_IA32_EMULATION
149 wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
150 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
151 index 4abe5b806d2a..dc100a7052ee 100644
152 --- a/arch/x86/entry/entry_64.S
153 +++ b/arch/x86/entry/entry_64.S
154 @@ -135,6 +135,64 @@ END(native_usergs_sysret64)
155 * with them due to bugs in both AMD and Intel CPUs.
156 */
157
158 + .pushsection .entry_trampoline, "ax"
159 +
160 +/*
161 + * The code in here gets remapped into cpu_entry_area's trampoline. This means
162 + * that the assembler and linker have the wrong idea as to where this code
163 + * lives (and, in fact, it's mapped more than once, so it's not even at a
164 + * fixed address). So we can't reference any symbols outside the entry
165 + * trampoline and expect it to work.
166 + *
167 + * Instead, we carefully abuse %rip-relative addressing.
168 + * _entry_trampoline(%rip) refers to the start of the (remapped) entry
169 + * trampoline. We can thus find cpu_entry_area with this macro:
170 + */
171 +
172 +#define CPU_ENTRY_AREA \
173 + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
174 +
175 +/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
176 +#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \
177 + SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
178 +
179 +ENTRY(entry_SYSCALL_64_trampoline)
180 + UNWIND_HINT_EMPTY
181 + swapgs
182 +
183 + /* Stash the user RSP. */
184 + movq %rsp, RSP_SCRATCH
185 +
186 + /* Load the top of the task stack into RSP */
187 + movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
188 +
189 + /* Start building the simulated IRET frame. */
190 + pushq $__USER_DS /* pt_regs->ss */
191 + pushq RSP_SCRATCH /* pt_regs->sp */
192 + pushq %r11 /* pt_regs->flags */
193 + pushq $__USER_CS /* pt_regs->cs */
194 + pushq %rcx /* pt_regs->ip */
195 +
196 + /*
197 + * x86 lacks a near absolute jump, and we can't jump to the real
198 + * entry text with a relative jump. We could push the target
199 + * address and then use retq, but this destroys the pipeline on
200 + * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
201 + * spill RDI and restore it in a second-stage trampoline.
202 + */
203 + pushq %rdi
204 + movq $entry_SYSCALL_64_stage2, %rdi
205 + jmp *%rdi
206 +END(entry_SYSCALL_64_trampoline)
207 +
208 + .popsection
209 +
210 +ENTRY(entry_SYSCALL_64_stage2)
211 + UNWIND_HINT_EMPTY
212 + popq %rdi
213 + jmp entry_SYSCALL_64_after_hwframe
214 +END(entry_SYSCALL_64_stage2)
215 +
216 ENTRY(entry_SYSCALL_64)
217 UNWIND_HINT_EMPTY
218 /*
219 diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
220 index f05f00acac89..423aa36f0150 100644
221 --- a/arch/x86/kernel/vmlinux.lds.S
222 +++ b/arch/x86/kernel/vmlinux.lds.S
223 @@ -106,6 +106,15 @@ SECTIONS
224 SOFTIRQENTRY_TEXT
225 *(.fixup)
226 *(.gnu.warning)
227 +
228 +#ifdef CONFIG_X86_64
229 + . = ALIGN(PAGE_SIZE);
230 + _entry_trampoline = .;
231 + *(.entry_trampoline)
232 + . = ALIGN(PAGE_SIZE);
233 + ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
234 +#endif
235 +
236 /* End of text section */
237 _etext = .;
238 } :text = 0x9090
239 --
240 2.14.2
241