]> git.proxmox.com Git - pve-kernel.git/blame - patches/kernel/0155-x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch
KPTI: add follow-up fixes
[pve-kernel.git] / patches / kernel / 0155-x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch
CommitLineData
321d628a
FG
1From 2ae2b7902084742e84eac3e32409f0d9ff4811d8 Mon Sep 17 00:00:00 2001
2From: Andy Lutomirski <luto@kernel.org>
3Date: Mon, 4 Dec 2017 15:07:25 +0100
e4cdf2a5 4Subject: [PATCH 155/241] x86/entry/64: Create a per-CPU SYSCALL entry
321d628a
FG
5 trampoline
6MIME-Version: 1.0
7Content-Type: text/plain; charset=UTF-8
8Content-Transfer-Encoding: 8bit
9
10CVE-2017-5754
11
12Handling SYSCALL is tricky: the SYSCALL handler is entered with every
13single register (except FLAGS), including RSP, live. It somehow needs
14to set RSP to point to a valid stack, which means it needs to save the
15user RSP somewhere and find its own stack pointer. The canonical way
16to do this is with SWAPGS, which lets us access percpu data using the
17%gs prefix.
18
19With PAGE_TABLE_ISOLATION-like pagetable switching, this is
20problematic. Without a scratch register, switching CR3 is impossible, so
21%gs-based percpu memory would need to be mapped in the user pagetables.
22Doing that without information leaks is difficult or impossible.
23
24Instead, use a different sneaky trick. Map a copy of the first part
25of the SYSCALL asm at a different address for each CPU. Now RIP
26varies depending on the CPU, so we can use RIP-relative memory access
27to access percpu memory. By putting the relevant information (one
28scratch slot and the stack address) at a constant offset relative to
29RIP, we can make SYSCALL work without relying on %gs.
30
31A nice thing about this approach is that we can easily switch it on
32and off if we want pagetable switching to be configurable.
33
34The compat variant of SYSCALL doesn't have this problem in the first
35place -- there are plenty of scratch registers, since we don't care
36about preserving r8-r15. This patch therefore doesn't touch SYSCALL32
37at all.
38
39This patch actually seems to be a small speedup. With this patch,
40SYSCALL touches an extra cache line and an extra virtual page, but
41the pipeline no longer stalls waiting for SWAPGS. It seems that, at
42least in a tight loop, the latter outweighs the former.
43
44Thanks to David Laight for an optimization tip.
45
46Signed-off-by: Andy Lutomirski <luto@kernel.org>
47Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
48Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
49Reviewed-by: Borislav Petkov <bpetkov@suse.de>
50Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
51Cc: Borislav Petkov <bp@alien8.de>
52Cc: Brian Gerst <brgerst@gmail.com>
53Cc: Dave Hansen <dave.hansen@intel.com>
54Cc: Dave Hansen <dave.hansen@linux.intel.com>
55Cc: David Laight <David.Laight@aculab.com>
56Cc: Denys Vlasenko <dvlasenk@redhat.com>
57Cc: Eduardo Valentin <eduval@amazon.com>
58Cc: Greg KH <gregkh@linuxfoundation.org>
59Cc: H. Peter Anvin <hpa@zytor.com>
60Cc: Josh Poimboeuf <jpoimboe@redhat.com>
61Cc: Juergen Gross <jgross@suse.com>
62Cc: Linus Torvalds <torvalds@linux-foundation.org>
63Cc: Peter Zijlstra <peterz@infradead.org>
64Cc: Rik van Riel <riel@redhat.com>
65Cc: Will Deacon <will.deacon@arm.com>
66Cc: aliguori@amazon.com
67Cc: daniel.gruss@iaik.tugraz.at
68Cc: hughd@google.com
69Cc: keescook@google.com
70Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de
71Signed-off-by: Ingo Molnar <mingo@kernel.org>
72(cherry picked from commit 3386bc8aed825e9f1f65ce38df4b109b2019b71a)
73Signed-off-by: Andy Whitcroft <apw@canonical.com>
74Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
75(cherry picked from commit 9fec5954d068a19bbf134da7af66db94699b03a3)
76Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
77---
78 arch/x86/include/asm/fixmap.h | 2 ++
79 arch/x86/kernel/asm-offsets.c | 1 +
80 arch/x86/kernel/cpu/common.c | 15 ++++++++++-
81 arch/x86/entry/entry_64.S | 58 +++++++++++++++++++++++++++++++++++++++++++
82 arch/x86/kernel/vmlinux.lds.S | 9 +++++++
83 5 files changed, 84 insertions(+), 1 deletion(-)
84
85diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
86index c92fc30e6def..189d12d8afe0 100644
87--- a/arch/x86/include/asm/fixmap.h
88+++ b/arch/x86/include/asm/fixmap.h
89@@ -61,6 +61,8 @@ struct cpu_entry_area {
90 * of the TSS region.
91 */
92 struct tss_struct tss;
93+
94+ char entry_trampoline[PAGE_SIZE];
95 };
96
97 #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
98diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
99index f765c3253ec3..822be00c85ff 100644
100--- a/arch/x86/kernel/asm-offsets.c
101+++ b/arch/x86/kernel/asm-offsets.c
102@@ -100,4 +100,5 @@ void common(void) {
103
104 /* Layout info for cpu_entry_area */
105 OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
106+ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
107 }
108diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
109index 404e4b75db6e..c2b2ee73b8a1 100644
110--- a/arch/x86/kernel/cpu/common.c
111+++ b/arch/x86/kernel/cpu/common.c
112@@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
113 static inline void setup_cpu_entry_area(int cpu)
114 {
115 #ifdef CONFIG_X86_64
116+ extern char _entry_trampoline[];
117+
118 /* On 64-bit systems, we use a read-only fixmap GDT. */
119 pgprot_t gdt_prot = PAGE_KERNEL_RO;
120 #else
121@@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area(int cpu)
122 #ifdef CONFIG_X86_32
123 this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu));
124 #endif
125+
126+#ifdef CONFIG_X86_64
127+ __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
128+ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
129+#endif
130 }
131
132 /* Load the original GDT from the per-cpu structure */
133@@ -1396,10 +1403,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
134 /* May not be marked __init: used by software suspend */
135 void syscall_init(void)
136 {
137+ extern char _entry_trampoline[];
138+ extern char entry_SYSCALL_64_trampoline[];
139+
140 int cpu = smp_processor_id();
141+ unsigned long SYSCALL64_entry_trampoline =
142+ (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
143+ (entry_SYSCALL_64_trampoline - _entry_trampoline);
144
145 wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
146- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
147+ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
148
149 #ifdef CONFIG_IA32_EMULATION
150 wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
151diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
152index 4abe5b806d2a..dc100a7052ee 100644
153--- a/arch/x86/entry/entry_64.S
154+++ b/arch/x86/entry/entry_64.S
155@@ -135,6 +135,64 @@ END(native_usergs_sysret64)
156 * with them due to bugs in both AMD and Intel CPUs.
157 */
158
159+ .pushsection .entry_trampoline, "ax"
160+
161+/*
162+ * The code in here gets remapped into cpu_entry_area's trampoline. This means
163+ * that the assembler and linker have the wrong idea as to where this code
164+ * lives (and, in fact, it's mapped more than once, so it's not even at a
165+ * fixed address). So we can't reference any symbols outside the entry
166+ * trampoline and expect it to work.
167+ *
168+ * Instead, we carefully abuse %rip-relative addressing.
169+ * _entry_trampoline(%rip) refers to the start of the (remapped) entry
170+ * trampoline. We can thus find cpu_entry_area with this macro:
171+ */
172+
173+#define CPU_ENTRY_AREA \
174+ _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
175+
176+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
177+#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \
178+ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
179+
180+ENTRY(entry_SYSCALL_64_trampoline)
181+ UNWIND_HINT_EMPTY
182+ swapgs
183+
184+ /* Stash the user RSP. */
185+ movq %rsp, RSP_SCRATCH
186+
187+ /* Load the top of the task stack into RSP */
188+ movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
189+
190+ /* Start building the simulated IRET frame. */
191+ pushq $__USER_DS /* pt_regs->ss */
192+ pushq RSP_SCRATCH /* pt_regs->sp */
193+ pushq %r11 /* pt_regs->flags */
194+ pushq $__USER_CS /* pt_regs->cs */
195+ pushq %rcx /* pt_regs->ip */
196+
197+ /*
198+ * x86 lacks a near absolute jump, and we can't jump to the real
199+ * entry text with a relative jump. We could push the target
200+ * address and then use retq, but this destroys the pipeline on
201+ * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
202+ * spill RDI and restore it in a second-stage trampoline.
203+ */
204+ pushq %rdi
205+ movq $entry_SYSCALL_64_stage2, %rdi
206+ jmp *%rdi
207+END(entry_SYSCALL_64_trampoline)
208+
209+ .popsection
210+
211+ENTRY(entry_SYSCALL_64_stage2)
212+ UNWIND_HINT_EMPTY
213+ popq %rdi
214+ jmp entry_SYSCALL_64_after_hwframe
215+END(entry_SYSCALL_64_stage2)
216+
217 ENTRY(entry_SYSCALL_64)
218 UNWIND_HINT_EMPTY
219 /*
220diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
221index f05f00acac89..423aa36f0150 100644
222--- a/arch/x86/kernel/vmlinux.lds.S
223+++ b/arch/x86/kernel/vmlinux.lds.S
224@@ -106,6 +106,15 @@ SECTIONS
225 SOFTIRQENTRY_TEXT
226 *(.fixup)
227 *(.gnu.warning)
228+
229+#ifdef CONFIG_X86_64
230+ . = ALIGN(PAGE_SIZE);
231+ _entry_trampoline = .;
232+ *(.entry_trampoline)
233+ . = ALIGN(PAGE_SIZE);
234+ ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
235+#endif
236+
237 /* End of text section */
238 _etext = .;
239 } :text = 0x9090
240--
2412.14.2
242