]> git.proxmox.com Git - pve-kernel.git/blob - patches/kernel/0205-x86-pti-Put-the-LDT-in-its-own-PGD-if-PTI-is-on.patch
revert buggy SCSI error handler commit
[pve-kernel.git] / patches / kernel / 0205-x86-pti-Put-the-LDT-in-its-own-PGD-if-PTI-is-on.patch
1 From e0e5d2785d4b282a1f82f36199f52f9196868d6b Mon Sep 17 00:00:00 2001
2 From: Andy Lutomirski <luto@kernel.org>
3 Date: Tue, 12 Dec 2017 07:56:45 -0800
4 Subject: [PATCH 205/242] x86/pti: Put the LDT in its own PGD if PTI is on
5 MIME-Version: 1.0
6 Content-Type: text/plain; charset=UTF-8
7 Content-Transfer-Encoding: 8bit
8
9 CVE-2017-5754
10
11 With PTI enabled, the LDT must be mapped in the usermode tables somewhere.
12 The LDT is per process, i.e. per mm.
13
14 An earlier approach mapped the LDT on context switch into a fixmap area,
15 but that's a big overhead and exhausted the fixmap space when NR_CPUS got
16 big.
17
18 Take advantage of the fact that there is an address space hole which
19 provides a completely unused pgd. Use this pgd to manage per-mm LDT
20 mappings.
21
22 This has a down side: the LDT isn't (currently) randomized, and an attack
23 that can write the LDT is instant root due to call gates (thanks, AMD, for
24 leaving call gates in AMD64 but designing them wrong so they're only useful
25 for exploits). This can be mitigated by making the LDT read-only or
26 randomizing the mapping, either of which is straightforward on top of this
27 patch.
28
29 This will significantly slow down LDT users, but that shouldn't matter for
30 important workloads -- the LDT is only used by DOSEMU(2), Wine, and very
31 old libc implementations.
32
33 [ tglx: Cleaned it up. ]
34
35 Signed-off-by: Andy Lutomirski <luto@kernel.org>
36 Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
37 Cc: Borislav Petkov <bp@alien8.de>
38 Cc: Brian Gerst <brgerst@gmail.com>
39 Cc: Dave Hansen <dave.hansen@intel.com>
40 Cc: Dave Hansen <dave.hansen@linux.intel.com>
41 Cc: David Laight <David.Laight@aculab.com>
42 Cc: H. Peter Anvin <hpa@zytor.com>
43 Cc: Josh Poimboeuf <jpoimboe@redhat.com>
44 Cc: Juergen Gross <jgross@suse.com>
45 Cc: Kees Cook <keescook@chromium.org>
46 Cc: Kirill A. Shutemov <kirill@shutemov.name>
47 Cc: Linus Torvalds <torvalds@linux-foundation.org>
48 Cc: Peter Zijlstra <peterz@infradead.org>
49 Signed-off-by: Ingo Molnar <mingo@kernel.org>
50 (cherry picked from commit f55f0501cbf65ec41cca5058513031b711730b1d)
51 Signed-off-by: Andy Whitcroft <apw@canonical.com>
52 Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
53 (cherry picked from commit c250643846b45ea6782fb0cfcc15e8cd34744bc7)
54 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
55 ---
56 Documentation/x86/x86_64/mm.txt | 3 +-
57 arch/x86/include/asm/mmu_context.h | 59 ++++++++++++--
58 arch/x86/include/asm/pgtable_64_types.h | 4 +
59 arch/x86/include/asm/processor.h | 23 ++++--
60 arch/x86/kernel/ldt.c | 139 +++++++++++++++++++++++++++++++-
61 arch/x86/mm/dump_pagetables.c | 9 +++
62 6 files changed, 220 insertions(+), 17 deletions(-)
63
64 diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
65 index 496a1dbf139d..ad41b3813f0a 100644
66 --- a/Documentation/x86/x86_64/mm.txt
67 +++ b/Documentation/x86/x86_64/mm.txt
68 @@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
69 ... unused hole ...
70 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
71 ... unused hole ...
72 +fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
73 fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
74 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
75 ... unused hole ...
76 @@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables:
77 hole caused by [56:63] sign extension
78 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
79 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
80 -ff90000000000000 - ff9fffffffffffff (=52 bits) hole
81 +ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
82 ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
83 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
84 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
85 diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
86 index 89a01ad7e370..9e3546e1c0f4 100644
87 --- a/arch/x86/include/asm/mmu_context.h
88 +++ b/arch/x86/include/asm/mmu_context.h
89 @@ -49,10 +49,33 @@ struct ldt_struct {
90 * call gates. On native, we could merge the ldt_struct and LDT
91 * allocations, but it's not worth trying to optimize.
92 */
93 - struct desc_struct *entries;
94 - unsigned int nr_entries;
95 + struct desc_struct *entries;
96 + unsigned int nr_entries;
97 +
98 + /*
99 + * If PTI is in use, then the entries array is not mapped while we're
100 + * in user mode. The whole array will be aliased at the address
101 + * given by ldt_slot_va(slot). We use two slots so that we can allocate
102 + * and map, and enable a new LDT without invalidating the mapping
103 + * of an older, still-in-use LDT.
104 + *
105 + * slot will be -1 if this LDT doesn't have an alias mapping.
106 + */
107 + int slot;
108 };
109
110 +/* This is a multiple of PAGE_SIZE. */
111 +#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
112 +
113 +static inline void *ldt_slot_va(int slot)
114 +{
115 +#ifdef CONFIG_X86_64
116 + return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
117 +#else
118 + BUG();
119 +#endif
120 +}
121 +
122 /*
123 * Used for LDT copy/destruction.
124 */
125 @@ -63,6 +86,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
126 }
127 int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
128 void destroy_context_ldt(struct mm_struct *mm);
129 +void ldt_arch_exit_mmap(struct mm_struct *mm);
130 #else /* CONFIG_MODIFY_LDT_SYSCALL */
131 static inline void init_new_context_ldt(struct mm_struct *mm) { }
132 static inline int ldt_dup_context(struct mm_struct *oldmm,
133 @@ -70,7 +94,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
134 {
135 return 0;
136 }
137 -static inline void destroy_context_ldt(struct mm_struct *mm) {}
138 +static inline void destroy_context_ldt(struct mm_struct *mm) { }
139 +static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
140 #endif
141
142 static inline void load_mm_ldt(struct mm_struct *mm)
143 @@ -95,10 +120,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
144 * that we can see.
145 */
146
147 - if (unlikely(ldt))
148 - set_ldt(ldt->entries, ldt->nr_entries);
149 - else
150 + if (unlikely(ldt)) {
151 + if (static_cpu_has(X86_FEATURE_PTI)) {
152 + if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
153 + /*
154 + * Whoops -- either the new LDT isn't mapped
155 + * (if slot == -1) or is mapped into a bogus
156 + * slot (if slot > 1).
157 + */
158 + clear_LDT();
159 + return;
160 + }
161 +
162 + /*
163 + * If page table isolation is enabled, ldt->entries
164 + * will not be mapped in the userspace pagetables.
165 + * Tell the CPU to access the LDT through the alias
166 + * at ldt_slot_va(ldt->slot).
167 + */
168 + set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
169 + } else {
170 + set_ldt(ldt->entries, ldt->nr_entries);
171 + }
172 + } else {
173 clear_LDT();
174 + }
175 #else
176 clear_LDT();
177 #endif
178 @@ -193,6 +239,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
179 static inline void arch_exit_mmap(struct mm_struct *mm)
180 {
181 paravirt_arch_exit_mmap(mm);
182 + ldt_arch_exit_mmap(mm);
183 }
184
185 #ifdef CONFIG_X86_64
186 diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
187 index 5932dead34ee..e8a809ee0bb6 100644
188 --- a/arch/x86/include/asm/pgtable_64_types.h
189 +++ b/arch/x86/include/asm/pgtable_64_types.h
190 @@ -81,10 +81,14 @@ typedef struct { pteval_t pte; } pte_t;
191 # define VMALLOC_SIZE_TB _AC(12800, UL)
192 # define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
193 # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
194 +# define LDT_PGD_ENTRY _AC(-112, UL)
195 +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
196 #else
197 # define VMALLOC_SIZE_TB _AC(32, UL)
198 # define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
199 # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
200 +# define LDT_PGD_ENTRY _AC(-4, UL)
201 +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
202 #endif
203
204 #ifdef CONFIG_RANDOMIZE_MEMORY
205 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
206 index 935d68609922..24503521c947 100644
207 --- a/arch/x86/include/asm/processor.h
208 +++ b/arch/x86/include/asm/processor.h
209 @@ -843,13 +843,22 @@ static inline void spin_lock_prefetch(const void *x)
210
211 #else
212 /*
213 - * User space process size. 47bits minus one guard page. The guard
214 - * page is necessary on Intel CPUs: if a SYSCALL instruction is at
215 - * the highest possible canonical userspace address, then that
216 - * syscall will enter the kernel with a non-canonical return
217 - * address, and SYSRET will explode dangerously. We avoid this
218 - * particular problem by preventing anything from being mapped
219 - * at the maximum canonical address.
220 + * User space process size. This is the first address outside the user range.
221 + * There are a few constraints that determine this:
222 + *
223 + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
224 + * address, then that syscall will enter the kernel with a
225 + * non-canonical return address, and SYSRET will explode dangerously.
226 + * We avoid this particular problem by preventing anything executable
227 + * from being mapped at the maximum canonical address.
228 + *
229 + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
230 + * CPUs malfunction if they execute code from the highest canonical page.
231 + * They'll speculate right off the end of the canonical space, and
232 + * bad things happen. This is worked around in the same way as the
233 + * Intel problem.
234 + *
235 + * With page table isolation enabled, we map the LDT in ... [stay tuned]
236 */
237 #define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
238
239 diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
240 index 74a5aaf13f3c..eceaada581ff 100644
241 --- a/arch/x86/kernel/ldt.c
242 +++ b/arch/x86/kernel/ldt.c
243 @@ -23,6 +23,7 @@
244 #include <linux/uaccess.h>
245
246 #include <asm/ldt.h>
247 +#include <asm/tlb.h>
248 #include <asm/desc.h>
249 #include <asm/mmu_context.h>
250 #include <asm/syscalls.h>
251 @@ -50,13 +51,11 @@ static void refresh_ldt_segments(void)
252 static void flush_ldt(void *__mm)
253 {
254 struct mm_struct *mm = __mm;
255 - mm_context_t *pc;
256
257 if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
258 return;
259
260 - pc = &mm->context;
261 - set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
262 + load_mm_ldt(mm);
263
264 refresh_ldt_segments();
265 }
266 @@ -93,10 +92,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
267 return NULL;
268 }
269
270 + /* The new LDT isn't aliased for PTI yet. */
271 + new_ldt->slot = -1;
272 +
273 new_ldt->nr_entries = num_entries;
274 return new_ldt;
275 }
276
277 +/*
278 + * If PTI is enabled, this maps the LDT into the kernelmode and
279 + * usermode tables for the given mm.
280 + *
281 + * There is no corresponding unmap function. Even if the LDT is freed, we
282 + * leave the PTEs around until the slot is reused or the mm is destroyed.
283 + * This is harmless: the LDT is always in ordinary memory, and no one will
284 + * access the freed slot.
285 + *
286 + * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
287 + * it useful, and the flush would slow down modify_ldt().
288 + */
289 +static int
290 +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
291 +{
292 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
293 + bool is_vmalloc, had_top_level_entry;
294 + unsigned long va;
295 + spinlock_t *ptl;
296 + pgd_t *pgd;
297 + int i;
298 +
299 + if (!static_cpu_has(X86_FEATURE_PTI))
300 + return 0;
301 +
302 + /*
303 + * Any given ldt_struct should have map_ldt_struct() called at most
304 + * once.
305 + */
306 + WARN_ON(ldt->slot != -1);
307 +
308 + /*
309 + * Did we already have the top level entry allocated? We can't
310 + * use pgd_none() for this because it doesn't do anything on
311 + * 4-level page table kernels.
312 + */
313 + pgd = pgd_offset(mm, LDT_BASE_ADDR);
314 + had_top_level_entry = (pgd->pgd != 0);
315 +
316 + is_vmalloc = is_vmalloc_addr(ldt->entries);
317 +
318 + for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
319 + unsigned long offset = i << PAGE_SHIFT;
320 + const void *src = (char *)ldt->entries + offset;
321 + unsigned long pfn;
322 + pte_t pte, *ptep;
323 +
324 + va = (unsigned long)ldt_slot_va(slot) + offset;
325 + pfn = is_vmalloc ? vmalloc_to_pfn(src) :
326 + page_to_pfn(virt_to_page(src));
327 + /*
328 + * Treat the PTI LDT range as a *userspace* range.
329 + * get_locked_pte() will allocate all needed pagetables
330 + * and account for them in this mm.
331 + */
332 + ptep = get_locked_pte(mm, va, &ptl);
333 + if (!ptep)
334 + return -ENOMEM;
335 + pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
336 + set_pte_at(mm, va, ptep, pte);
337 + pte_unmap_unlock(ptep, ptl);
338 + }
339 +
340 + if (mm->context.ldt) {
341 + /*
342 + * We already had an LDT. The top-level entry should already
343 + * have been allocated and synchronized with the usermode
344 + * tables.
345 + */
346 + WARN_ON(!had_top_level_entry);
347 + if (static_cpu_has(X86_FEATURE_PTI))
348 + WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
349 + } else {
350 + /*
351 + * This is the first time we're mapping an LDT for this process.
352 + * Sync the pgd to the usermode tables.
353 + */
354 + WARN_ON(had_top_level_entry);
355 + if (static_cpu_has(X86_FEATURE_PTI)) {
356 + WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
357 + set_pgd(kernel_to_user_pgdp(pgd), *pgd);
358 + }
359 + }
360 +
361 + va = (unsigned long)ldt_slot_va(slot);
362 + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
363 +
364 + ldt->slot = slot;
365 +#endif
366 + return 0;
367 +}
368 +
369 +static void free_ldt_pgtables(struct mm_struct *mm)
370 +{
371 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
372 + struct mmu_gather tlb;
373 + unsigned long start = LDT_BASE_ADDR;
374 + unsigned long end = start + (1UL << PGDIR_SHIFT);
375 +
376 + if (!static_cpu_has(X86_FEATURE_PTI))
377 + return;
378 +
379 + tlb_gather_mmu(&tlb, mm, start, end);
380 + free_pgd_range(&tlb, start, end, start, end);
381 + tlb_finish_mmu(&tlb, start, end);
382 +#endif
383 +}
384 +
385 /* After calling this, the LDT is immutable. */
386 static void finalize_ldt_struct(struct ldt_struct *ldt)
387 {
388 @@ -155,6 +265,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
389 new_ldt->nr_entries * LDT_ENTRY_SIZE);
390 finalize_ldt_struct(new_ldt);
391
392 + retval = map_ldt_struct(mm, new_ldt, 0);
393 + if (retval) {
394 + free_ldt_pgtables(mm);
395 + free_ldt_struct(new_ldt);
396 + goto out_unlock;
397 + }
398 mm->context.ldt = new_ldt;
399
400 out_unlock:
401 @@ -173,6 +289,11 @@ void destroy_context_ldt(struct mm_struct *mm)
402 mm->context.ldt = NULL;
403 }
404
405 +void ldt_arch_exit_mmap(struct mm_struct *mm)
406 +{
407 + free_ldt_pgtables(mm);
408 +}
409 +
410 static int read_ldt(void __user *ptr, unsigned long bytecount)
411 {
412 struct mm_struct *mm = current->mm;
413 @@ -286,6 +407,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
414 new_ldt->entries[ldt_info.entry_number] = ldt;
415 finalize_ldt_struct(new_ldt);
416
417 + /*
418 + * If we are using PTI, map the new LDT into the userspace pagetables.
419 + * If there is already an LDT, use the other slot so that other CPUs
420 + * will continue to use the old LDT until install_ldt() switches
421 + * them over to the new LDT.
422 + */
423 + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
424 + if (error) {
425 + free_ldt_struct(old_ldt);
426 + goto out_unlock;
427 + }
428 +
429 install_ldt(mm, new_ldt);
430 free_ldt_struct(old_ldt);
431 error = 0;
432 diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
433 index 3b7720404a9f..eed93dd4cb4a 100644
434 --- a/arch/x86/mm/dump_pagetables.c
435 +++ b/arch/x86/mm/dump_pagetables.c
436 @@ -52,11 +52,17 @@ enum address_markers_idx {
437 USER_SPACE_NR = 0,
438 KERNEL_SPACE_NR,
439 LOW_KERNEL_NR,
440 +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
441 + LDT_NR,
442 +#endif
443 VMALLOC_START_NR,
444 VMEMMAP_START_NR,
445 #ifdef CONFIG_KASAN
446 KASAN_SHADOW_START_NR,
447 KASAN_SHADOW_END_NR,
448 +#endif
449 +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
450 + LDT_NR,
451 #endif
452 CPU_ENTRY_AREA_NR,
453 #ifdef CONFIG_X86_ESPFIX64
454 @@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
455 #ifdef CONFIG_KASAN
456 [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
457 [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
458 +#endif
459 +#ifdef CONFIG_MODIFY_LDT_SYSCALL
460 + [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
461 #endif
462 [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
463 #ifdef CONFIG_X86_ESPFIX64
464 --
465 2.14.2
466