Commit | Line | Data |
---|---|---|
59d5af67 | 1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
321d628a FG |
2 | From: Dave Hansen <dave.hansen@linux.intel.com> |
3 | Date: Mon, 4 Dec 2017 15:07:35 +0100 | |
59d5af67 | 4 | Subject: [PATCH] x86/mm/pti: Prepare the x86/entry assembly code for |
321d628a FG |
5 | entry/exit CR3 switching |
6 | MIME-Version: 1.0 | |
7 | Content-Type: text/plain; charset=UTF-8 | |
8 | Content-Transfer-Encoding: 8bit | |
9 | ||
10 | CVE-2017-5754 | |
11 | ||
12 | PAGE_TABLE_ISOLATION needs to switch to a different CR3 value when it | |
13 | enters the kernel and switch back when it exits. This essentially needs to | |
14 | be done before leaving assembly code. | |
15 | ||
16 | This is extra challenging because the switching context is tricky: the | |
17 | registers that can be clobbered can vary. It is also hard to store things | |
18 | on the stack because there is an established ABI (ptregs) or the stack is | |
19 | entirely unsafe to use. | |
20 | ||
21 | Establish a set of macros that allow changing to the user and kernel CR3 | |
22 | values. | |
23 | ||
24 | Interactions with SWAPGS: | |
25 | ||
26 | Previous versions of the PAGE_TABLE_ISOLATION code relied on having | |
27 | per-CPU scratch space to save/restore a register that can be used for the | |
28 | CR3 MOV. The %GS register is used to index into our per-CPU space, so | |
29 | SWAPGS *had* to be done before the CR3 switch. That scratch space is gone | |
30 | now, but the semantic that SWAPGS must be done before the CR3 MOV is | |
31 | retained. This is good to keep because it is not that hard to do and it | |
32 | allows us to do things like add per-CPU debugging information. | |
33 | ||
34 | What this does in the NMI code is worth pointing out. NMIs can interrupt | |
35 | *any* context and they can also be nested with NMIs interrupting other | |
36 | NMIs. The comments below ".Lnmi_from_kernel" explain the format of the | |
37 | stack during this situation. Changing the format of this stack is hard. | |
38 | Instead of storing the old CR3 value on the stack, this depends on the | |
39 | *regular* register save/restore mechanism and then uses %r14 to keep CR3 | |
40 | during the NMI. It is callee-saved and will not be clobbered by the C NMI | |
41 | handlers that get called. | |
42 | ||
43 | [ PeterZ: ESPFIX optimization ] | |
44 | ||
45 | Based-on-code-from: Andy Lutomirski <luto@kernel.org> | |
46 | Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> | |
47 | Signed-off-by: Thomas Gleixner <tglx@linutronix.de> | |
48 | Reviewed-by: Borislav Petkov <bp@suse.de> | |
49 | Reviewed-by: Thomas Gleixner <tglx@linutronix.de> | |
50 | Cc: Andy Lutomirski <luto@kernel.org> | |
51 | Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> | |
52 | Cc: Borislav Petkov <bp@alien8.de> | |
53 | Cc: Brian Gerst <brgerst@gmail.com> | |
54 | Cc: David Laight <David.Laight@aculab.com> | |
55 | Cc: Denys Vlasenko <dvlasenk@redhat.com> | |
56 | Cc: Eduardo Valentin <eduval@amazon.com> | |
57 | Cc: Greg KH <gregkh@linuxfoundation.org> | |
58 | Cc: H. Peter Anvin <hpa@zytor.com> | |
59 | Cc: Josh Poimboeuf <jpoimboe@redhat.com> | |
60 | Cc: Juergen Gross <jgross@suse.com> | |
61 | Cc: Linus Torvalds <torvalds@linux-foundation.org> | |
62 | Cc: Peter Zijlstra <peterz@infradead.org> | |
63 | Cc: Will Deacon <will.deacon@arm.com> | |
64 | Cc: aliguori@amazon.com | |
65 | Cc: daniel.gruss@iaik.tugraz.at | |
66 | Cc: hughd@google.com | |
67 | Cc: keescook@google.com | |
68 | Cc: linux-mm@kvack.org | |
69 | Signed-off-by: Ingo Molnar <mingo@kernel.org> | |
70 | (cherry picked from commit 8a09317b895f073977346779df52f67c1056d81d) | |
71 | Signed-off-by: Andy Whitcroft <apw@canonical.com> | |
72 | Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com> | |
73 | (cherry picked from commit 313dfb599cf7f8e53fc6f710d15bed60972dcd6f) | |
74 | Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> | |
75 | --- | |
76 | arch/x86/entry/calling.h | 66 ++++++++++++++++++++++++++++++++++++++++ | |
77 | arch/x86/entry/entry_64.S | 45 +++++++++++++++++++++++---- | |
78 | arch/x86/entry/entry_64_compat.S | 24 ++++++++++++++- | |
79 | 3 files changed, 128 insertions(+), 7 deletions(-) | |
80 | ||
81 | diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h | |
82 | index 1895a685d3dd..dde6262be0a3 100644 | |
83 | --- a/arch/x86/entry/calling.h | |
84 | +++ b/arch/x86/entry/calling.h | |
85 | @@ -1,5 +1,7 @@ | |
86 | #include <linux/jump_label.h> | |
87 | #include <asm/unwind_hints.h> | |
88 | +#include <asm/cpufeatures.h> | |
89 | +#include <asm/page_types.h> | |
90 | ||
91 | /* | |
92 | ||
93 | @@ -186,6 +188,70 @@ For 32-bit we have the following conventions - kernel is built with | |
94 | #endif | |
95 | .endm | |
96 | ||
97 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION | |
98 | + | |
99 | +/* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two halves: */ | |
100 | +#define PTI_SWITCH_MASK (1<<PAGE_SHIFT) | |
101 | + | |
102 | +.macro ADJUST_KERNEL_CR3 reg:req | |
103 | + /* Clear "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ | |
104 | + andq $(~PTI_SWITCH_MASK), \reg | |
105 | +.endm | |
106 | + | |
107 | +.macro ADJUST_USER_CR3 reg:req | |
108 | + /* Move CR3 up a page to the user page tables: */ | |
109 | + orq $(PTI_SWITCH_MASK), \reg | |
110 | +.endm | |
111 | + | |
112 | +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req | |
113 | + mov %cr3, \scratch_reg | |
114 | + ADJUST_KERNEL_CR3 \scratch_reg | |
115 | + mov \scratch_reg, %cr3 | |
116 | +.endm | |
117 | + | |
118 | +.macro SWITCH_TO_USER_CR3 scratch_reg:req | |
119 | + mov %cr3, \scratch_reg | |
120 | + ADJUST_USER_CR3 \scratch_reg | |
121 | + mov \scratch_reg, %cr3 | |
122 | +.endm | |
123 | + | |
124 | +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req | |
125 | + movq %cr3, \scratch_reg | |
126 | + movq \scratch_reg, \save_reg | |
127 | + /* | |
128 | + * Is the switch bit zero? If so, CR3 already points at the | |
129 | + * kernel page tables and no CR3 write is needed. | |
130 | + */ | |
131 | + testq $(PTI_SWITCH_MASK), \scratch_reg | |
132 | + jz .Ldone_\@ | |
133 | + | |
134 | + ADJUST_KERNEL_CR3 \scratch_reg | |
135 | + movq \scratch_reg, %cr3 | |
136 | + | |
137 | +.Ldone_\@: | |
138 | +.endm | |
139 | + | |
140 | +.macro RESTORE_CR3 save_reg:req | |
141 | + /* | |
142 | + * The CR3 write could be avoided when not changing its value, | |
143 | + * but would require a CR3 read *and* a scratch register. | |
144 | + */ | |
145 | + movq \save_reg, %cr3 | |
146 | +.endm | |
147 | + | |
148 | +#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ | |
149 | + | |
150 | +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req | |
151 | +.endm | |
152 | +.macro SWITCH_TO_USER_CR3 scratch_reg:req | |
153 | +.endm | |
154 | +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req | |
155 | +.endm | |
156 | +.macro RESTORE_CR3 save_reg:req | |
157 | +.endm | |
158 | + | |
159 | +#endif | |
160 | + | |
161 | #endif /* CONFIG_X86_64 */ | |
162 | ||
163 | /* | |
164 | diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S | |
165 | index 03e052f02176..292ccc6ec48d 100644 | |
166 | --- a/arch/x86/entry/entry_64.S | |
167 | +++ b/arch/x86/entry/entry_64.S | |
168 | @@ -163,6 +163,9 @@ ENTRY(entry_SYSCALL_64_trampoline) | |
169 | /* Stash the user RSP. */ | |
170 | movq %rsp, RSP_SCRATCH | |
171 | ||
172 | + /* Note: using %rsp as a scratch reg. */ | |
173 | + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp | |
174 | + | |
175 | /* Load the top of the task stack into RSP */ | |
176 | movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp | |
177 | ||
178 | @@ -202,6 +205,10 @@ ENTRY(entry_SYSCALL_64) | |
179 | */ | |
180 | ||
181 | swapgs | |
182 | + /* | |
183 | + * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it | |
184 | + * is not required to switch CR3. | |
185 | + */ | |
186 | movq %rsp, PER_CPU_VAR(rsp_scratch) | |
187 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp | |
188 | ||
189 | @@ -398,6 +405,7 @@ syscall_return_via_sysret: | |
190 | * We are on the trampoline stack. All regs except RDI are live. | |
191 | * We can do future final exit work right here. | |
192 | */ | |
193 | + SWITCH_TO_USER_CR3 scratch_reg=%rdi | |
194 | ||
195 | popq %rdi | |
196 | popq %rsp | |
197 | @@ -735,6 +743,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) | |
198 | * We can do future final exit work right here. | |
199 | */ | |
200 | ||
201 | + SWITCH_TO_USER_CR3 scratch_reg=%rdi | |
202 | + | |
203 | /* Restore RDI. */ | |
204 | popq %rdi | |
205 | SWAPGS | |
206 | @@ -817,7 +827,9 @@ native_irq_return_ldt: | |
207 | */ | |
208 | ||
209 | pushq %rdi /* Stash user RDI */ | |
210 | - SWAPGS | |
211 | + SWAPGS /* to kernel GS */ | |
212 | + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ | |
213 | + | |
214 | movq PER_CPU_VAR(espfix_waddr), %rdi | |
215 | movq %rax, (0*8)(%rdi) /* user RAX */ | |
216 | movq (1*8)(%rsp), %rax /* user RIP */ | |
217 | @@ -833,7 +845,6 @@ native_irq_return_ldt: | |
218 | /* Now RAX == RSP. */ | |
219 | ||
220 | andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ | |
221 | - popq %rdi /* Restore user RDI */ | |
222 | ||
223 | /* | |
224 | * espfix_stack[31:16] == 0. The page tables are set up such that | |
225 | @@ -844,7 +855,11 @@ native_irq_return_ldt: | |
226 | * still points to an RO alias of the ESPFIX stack. | |
227 | */ | |
228 | orq PER_CPU_VAR(espfix_stack), %rax | |
229 | - SWAPGS | |
230 | + | |
231 | + SWITCH_TO_USER_CR3 scratch_reg=%rdi /* to user CR3 */ | |
232 | + SWAPGS /* to user GS */ | |
233 | + popq %rdi /* Restore user RDI */ | |
234 | + | |
235 | movq %rax, %rsp | |
236 | UNWIND_HINT_IRET_REGS offset=8 | |
237 | ||
238 | @@ -957,6 +972,8 @@ ENTRY(switch_to_thread_stack) | |
239 | UNWIND_HINT_FUNC | |
240 | ||
241 | pushq %rdi | |
242 | + /* Need to switch before accessing the thread stack. */ | |
243 | + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi | |
244 | movq %rsp, %rdi | |
245 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp | |
246 | UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI | |
247 | @@ -1256,7 +1273,11 @@ ENTRY(paranoid_entry) | |
248 | js 1f /* negative -> in kernel */ | |
249 | SWAPGS | |
250 | xorl %ebx, %ebx | |
251 | -1: ret | |
252 | + | |
253 | +1: | |
254 | + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 | |
255 | + | |
256 | + ret | |
257 | END(paranoid_entry) | |
258 | ||
259 | /* | |
260 | @@ -1278,6 +1299,7 @@ ENTRY(paranoid_exit) | |
261 | testl %ebx, %ebx /* swapgs needed? */ | |
262 | jnz .Lparanoid_exit_no_swapgs | |
263 | TRACE_IRQS_IRETQ | |
264 | + RESTORE_CR3 save_reg=%r14 | |
265 | SWAPGS_UNSAFE_STACK | |
266 | jmp .Lparanoid_exit_restore | |
267 | .Lparanoid_exit_no_swapgs: | |
268 | @@ -1305,6 +1327,8 @@ ENTRY(error_entry) | |
269 | * from user mode due to an IRET fault. | |
270 | */ | |
271 | SWAPGS | |
272 | + /* We have user CR3. Change to kernel CR3. */ | |
273 | + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax | |
274 | ||
275 | .Lerror_entry_from_usermode_after_swapgs: | |
276 | /* Put us onto the real thread stack. */ | |
277 | @@ -1351,6 +1375,7 @@ ENTRY(error_entry) | |
278 | * .Lgs_change's error handler with kernel gsbase. | |
279 | */ | |
280 | SWAPGS | |
281 | + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax | |
282 | jmp .Lerror_entry_done | |
283 | ||
284 | .Lbstep_iret: | |
285 | @@ -1360,10 +1385,11 @@ ENTRY(error_entry) | |
286 | ||
287 | .Lerror_bad_iret: | |
288 | /* | |
289 | - * We came from an IRET to user mode, so we have user gsbase. | |
290 | - * Switch to kernel gsbase: | |
291 | + * We came from an IRET to user mode, so we have user | |
292 | + * gsbase and CR3. Switch to kernel gsbase and CR3: | |
293 | */ | |
294 | SWAPGS | |
295 | + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax | |
296 | ||
297 | /* | |
298 | * Pretend that the exception came from user mode: set up pt_regs | |
299 | @@ -1395,6 +1421,10 @@ END(error_exit) | |
300 | /* | |
301 | * Runs on exception stack. Xen PV does not go through this path at all, | |
302 | * so we can use real assembly here. | |
303 | + * | |
304 | + * Registers: | |
305 | + * %r14: Used to save/restore the CR3 of the interrupted context | |
306 | + * when PAGE_TABLE_ISOLATION is in use. Do not clobber. | |
307 | */ | |
308 | ENTRY(nmi) | |
309 | UNWIND_HINT_IRET_REGS | |
310 | @@ -1458,6 +1488,7 @@ ENTRY(nmi) | |
311 | ||
312 | swapgs | |
313 | cld | |
314 | + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx | |
315 | movq %rsp, %rdx | |
316 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp | |
317 | UNWIND_HINT_IRET_REGS base=%rdx offset=8 | |
318 | @@ -1710,6 +1741,8 @@ end_repeat_nmi: | |
319 | movq $-1, %rsi | |
320 | call do_nmi | |
321 | ||
322 | + RESTORE_CR3 save_reg=%r14 | |
323 | + | |
324 | testl %ebx, %ebx /* swapgs needed? */ | |
325 | jnz nmi_restore | |
326 | nmi_swapgs: | |
327 | diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S | |
328 | index 2270601b6218..43f856aeee67 100644 | |
329 | --- a/arch/x86/entry/entry_64_compat.S | |
330 | +++ b/arch/x86/entry/entry_64_compat.S | |
331 | @@ -48,6 +48,10 @@ | |
332 | ENTRY(entry_SYSENTER_compat) | |
333 | /* Interrupts are off on entry. */ | |
334 | SWAPGS | |
335 | + | |
336 | + /* We are about to clobber %rsp anyway, clobbering here is OK */ | |
337 | + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp | |
338 | + | |
339 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp | |
340 | ||
341 | /* | |
342 | @@ -214,6 +218,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) | |
343 | pushq $0 /* pt_regs->r14 = 0 */ | |
344 | pushq $0 /* pt_regs->r15 = 0 */ | |
345 | ||
346 | + /* | |
347 | + * We just saved %rdi so it is safe to clobber. It is not | |
348 | + * preserved during the C calls inside TRACE_IRQS_OFF anyway. | |
349 | + */ | |
350 | + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi | |
351 | + | |
352 | /* | |
353 | * User mode is traced as though IRQs are on, and SYSENTER | |
354 | * turned them off. | |
355 | @@ -255,10 +265,22 @@ sysret32_from_system_call: | |
356 | * when the system call started, which is already known to user | |
357 | * code. We zero R8-R10 to avoid info leaks. | |
358 | */ | |
359 | + movq RSP-ORIG_RAX(%rsp), %rsp | |
360 | + | |
361 | + /* | |
362 | + * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored | |
363 | + * on the process stack which is not mapped to userspace and | |
364 | + * not readable after we SWITCH_TO_USER_CR3. Delay the CR3 | |
365 | + * switch until after the last reference to the process | |
366 | + * stack. | |
367 | + * | |
368 | + * %r8 is zeroed before the sysret, thus safe to clobber. | |
369 | + */ | |
370 | + SWITCH_TO_USER_CR3 scratch_reg=%r8 | |
371 | + | |
372 | xorq %r8, %r8 | |
373 | xorq %r9, %r9 | |
374 | xorq %r10, %r10 | |
375 | - movq RSP-ORIG_RAX(%rsp), %rsp | |
376 | swapgs | |
377 | sysretl | |
378 | END(entry_SYSCALL_compat) | |
379 | -- | |
380 | 2.14.2 | |
381 |