From 2ae2b7902084742e84eac3e32409f0d9ff4811d8 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:25 +0100
Subject: [PATCH 155/233] x86/entry/64: Create a per-CPU SYSCALL entry
 trampoline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CVE-2017-5754

Handling SYSCALL is tricky: the SYSCALL handler is entered with every
single register (except FLAGS), including RSP, live. It somehow needs
to set RSP to point to a valid stack, which means it needs to save the
user RSP somewhere and find its own stack pointer. The canonical way
to do this is with SWAPGS, which lets us access percpu data using the
%gs prefix.

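For reference, a simplified sketch of that conventional pattern (the
names below are the ones used by the existing 64-bit entry asm), in
which every percpu reference goes through the %gs base installed by
SWAPGS:

  swapgs                                /* switch to the kernel GS base */
  movq  %rsp, PER_CPU_VAR(rsp_scratch)  /* stash the user RSP in percpu scratch */
  movq  PER_CPU_VAR(cpu_current_top_of_stack), %rsp  /* find our own stack */
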
With PAGE_TABLE_ISOLATION-like pagetable switching, this is
problematic. Without a scratch register, switching CR3 is impossible, so
%gs-based percpu memory would need to be mapped in the user pagetables.
Doing that without information leaks is difficult or impossible.

Instead, use a different sneaky trick. Map a copy of the first part
of the SYSCALL asm at a different address for each CPU. Now RIP
varies depending on the CPU, so we can use RIP-relative memory access
to access percpu memory. By putting the relevant information (one
scratch slot and the stack address) at a constant offset relative to
RIP, we can make SYSCALL work without relying on %gs.

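As a simplified preview of the trampoline added below: inside the
per-CPU copy, a RIP-relative reference to the trampoline's own start
resolves to that copy, so subtracting the trampoline's offset within
struct cpu_entry_area yields the base of the current CPU's
cpu_entry_area, and the scratch slot and stack address then sit at
fixed offsets from that base (all names are the ones this patch
introduces):

  #define CPU_ENTRY_AREA \
          _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)

  movq  %rsp, RSP_SCRATCH  /* RSP_SCRATCH is a fixed offset from CPU_ENTRY_AREA */
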
A nice thing about this approach is that we can easily switch it on
and off if we want pagetable switching to be configurable.

The compat variant of SYSCALL doesn't have this problem in the first
place -- there are plenty of scratch registers, since we don't care
about preserving r8-r15. This patch therefore doesn't touch SYSCALL32
at all.

This patch actually seems to be a small speedup. With this patch,
SYSCALL touches an extra cache line and an extra virtual page, but
the pipeline no longer stalls waiting for SWAPGS. It seems that, at
least in a tight loop, the latter outweighs the former.

Thanks to David Laight for an optimization tip.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bpetkov@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 3386bc8aed825e9f1f65ce38df4b109b2019b71a)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 9fec5954d068a19bbf134da7af66db94699b03a3)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
 arch/x86/include/asm/fixmap.h |  2 ++
 arch/x86/kernel/asm-offsets.c |  1 +
 arch/x86/kernel/cpu/common.c  | 15 ++++++++++-
 arch/x86/entry/entry_64.S     | 58 +++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/vmlinux.lds.S |  9 +++++++
 5 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index c92fc30e6def..189d12d8afe0 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -61,6 +61,8 @@ struct cpu_entry_area {
 	 * of the TSS region.
 	 */
 	struct tss_struct tss;
+
+	char entry_trampoline[PAGE_SIZE];
 };
 
 #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index f765c3253ec3..822be00c85ff 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -100,4 +100,5 @@ void common(void) {
 
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 404e4b75db6e..c2b2ee73b8a1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
 static inline void setup_cpu_entry_area(int cpu)
 {
 #ifdef CONFIG_X86_64
+	extern char _entry_trampoline[];
+
 	/* On 64-bit systems, we use a read-only fixmap GDT. */
 	pgprot_t gdt_prot = PAGE_KERNEL_RO;
 #else
@@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area(int cpu)
 #ifdef CONFIG_X86_32
 	this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu));
 #endif
+
+#ifdef CONFIG_X86_64
+	__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
+		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
 }
 
 /* Load the original GDT from the per-cpu structure */
@@ -1396,10 +1403,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
+	extern char _entry_trampoline[];
+	extern char entry_SYSCALL_64_trampoline[];
+
 	int cpu = smp_processor_id();
+	unsigned long SYSCALL64_entry_trampoline =
+		(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
+		(entry_SYSCALL_64_trampoline - _entry_trampoline);
 
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
 
 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4abe5b806d2a..dc100a7052ee 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -135,6 +135,64 @@ END(native_usergs_sysret64)
  * with them due to bugs in both AMD and Intel CPUs.
  */
 
+	.pushsection .entry_trampoline, "ax"
+
+/*
+ * The code in here gets remapped into cpu_entry_area's trampoline. This means
+ * that the assembler and linker have the wrong idea as to where this code
+ * lives (and, in fact, it's mapped more than once, so it's not even at a
+ * fixed address). So we can't reference any symbols outside the entry
+ * trampoline and expect it to work.
+ *
+ * Instead, we carefully abuse %rip-relative addressing.
+ * _entry_trampoline(%rip) refers to the start of the remapped entry
+ * trampoline. We can thus find cpu_entry_area with this macro:
+ */
+
+#define CPU_ENTRY_AREA \
+	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+#define RSP_SCRATCH	CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \
+			SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
+
+ENTRY(entry_SYSCALL_64_trampoline)
+	UNWIND_HINT_EMPTY
+	swapgs
+
+	/* Stash the user RSP. */
+	movq	%rsp, RSP_SCRATCH
+
+	/* Load the top of the task stack into RSP */
+	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
+
+	/* Start building the simulated IRET frame. */
+	pushq	$__USER_DS			/* pt_regs->ss */
+	pushq	RSP_SCRATCH			/* pt_regs->sp */
+	pushq	%r11				/* pt_regs->flags */
+	pushq	$__USER_CS			/* pt_regs->cs */
+	pushq	%rcx				/* pt_regs->ip */
+
+	/*
+	 * x86 lacks a near absolute jump, and we can't jump to the real
+	 * entry text with a relative jump. We could push the target
+	 * address and then use retq, but this destroys the pipeline on
+	 * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
+	 * spill RDI and restore it in a second-stage trampoline.
+	 */
+	pushq	%rdi
+	movq	$entry_SYSCALL_64_stage2, %rdi
+	jmp	*%rdi
+END(entry_SYSCALL_64_trampoline)
+
+	.popsection
+
+ENTRY(entry_SYSCALL_64_stage2)
+	UNWIND_HINT_EMPTY
+	popq	%rdi
+	jmp	entry_SYSCALL_64_after_hwframe
+END(entry_SYSCALL_64_stage2)
+
 ENTRY(entry_SYSCALL_64)
 	UNWIND_HINT_EMPTY
 	/*
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f05f00acac89..423aa36f0150 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -106,6 +106,15 @@ SECTIONS
 		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
+
+#ifdef CONFIG_X86_64
+		. = ALIGN(PAGE_SIZE);
+		_entry_trampoline = .;
+		*(.entry_trampoline)
+		. = ALIGN(PAGE_SIZE);
+		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
+#endif
+
 		/* End of text section */
 		_etext = .;
 	} :text = 0x9090
-- 
2.14.2