/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * This code is based in part on work published here:
 *
 *	https://github.com/IAIK/KAISER
 *
 * The original work was written by and signed off for the Linux
 * kernel by:
 *
 *   Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
 *   Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
 *   Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
 *   Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
 *
 * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
 * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
 * Andy Lutomirski <luto@amacapital.net>
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

#include <asm/cpufeature.h>
#include <asm/hypervisor.h>
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/sections.h>

#undef pr_fmt
#define pr_fmt(fmt)	"Kernel/User page tables isolation: " fmt

/* Backporting helper */
#ifndef __GFP_NOTRACK
#define __GFP_NOTRACK	0
#endif

static void __init pti_print_if_insecure(const char *reason)
{
	if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		pr_info("%s\n", reason);
}

static void __init pti_print_if_secure(const char *reason)
{
	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		pr_info("%s\n", reason);
}

void __init pti_check_boottime_disable(void)
{
	char arg[5];
	int ret;

	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
		pti_print_if_insecure("disabled on XEN PV.");
		return;
	}

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (ret == 3 && !strncmp(arg, "off", 3)) {
			pti_print_if_insecure("disabled on command line.");
			return;
		}
		if (ret == 2 && !strncmp(arg, "on", 2)) {
			pti_print_if_secure("force enabled on command line.");
			goto enable;
		}
		if (ret == 4 && !strncmp(arg, "auto", 4))
			goto autosel;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti")) {
		pti_print_if_insecure("disabled on command line.");
		return;
	}

autosel:
	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		return;
enable:
	setup_force_cpu_cap(X86_FEATURE_PTI);
}

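/*
 * Summary of the boot-time choices handled above (derived from the code,
 * for illustration only):
 *
 *	Xen PV guest       -> PTI is always disabled, before any command
 *	                      line parsing happens
 *	pti=off or nopti   -> PTI stays off; a note is printed if the CPU
 *	                      is actually affected by Meltdown
 *	pti=on             -> PTI is force-enabled, even on unaffected CPUs
 *	pti=auto (and the  -> PTI is enabled iff X86_BUG_CPU_MELTDOWN is
 *	default, no option)   set for the boot CPU
 */
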
pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
	/*
	 * Changes to the high (kernel) portion of the kernelmode page
	 * tables are not automatically propagated to the usermode tables.
	 *
	 * Users should keep in mind that, unlike the kernelmode tables,
	 * there is no vmalloc_fault equivalent for the usermode tables.
	 * Top-level entries added to init_mm's usermode pgd after boot
	 * will not be automatically propagated to other mms.
	 */
	if (!pgdp_maps_userspace(pgdp))
		return pgd;

	/*
	 * The user page tables get the full PGD, accessible from
	 * userspace:
	 */
	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;

	/*
	 * If this is normal user memory, make it NX in the kernel
	 * pagetables so that, if we somehow screw up and return to
	 * usermode with the kernel CR3 loaded, we'll get a page fault
	 * instead of allowing user code to execute with the wrong CR3.
	 *
	 * As exceptions, we don't set NX if:
	 *  - _PAGE_USER is not set. This could be an executable
	 *     EFI runtime mapping or something similar, and the kernel
	 *     may execute from it
	 *  - we don't have NX support
	 *  - we're clearing the PGD (i.e. the new pgd is not present).
	 */
	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
	    (__supported_pte_mask & _PAGE_NX))
		pgd.pgd |= _PAGE_NX;

	/* return the copy of the PGD we want the kernel to use: */
	return pgd;
}

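/*
 * Caller-side sketch (assumption, not part of this file): the fast-path
 * feature check is expected to live in a pti_set_user_pgd() wrapper in
 * arch/x86/include/asm/pgtable_64.h, roughly:
 *
 *	static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
 *	{
 *		if (!static_cpu_has(X86_FEATURE_PTI))
 *			return pgd;
 *		return __pti_set_user_pgd(pgdp, pgd);
 *	}
 *
 * so that __pti_set_user_pgd() only ever runs when PTI is enabled.
 */
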
/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a P4D on success, or NULL on failure.
 */
static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
{
	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

	if (address < PAGE_OFFSET) {
		WARN_ONCE(1, "attempt to walk user address\n");
		return NULL;
	}

	if (pgd_none(*pgd)) {
		unsigned long new_p4d_page = __get_free_page(gfp);
		if (!new_p4d_page)
			return NULL;

		set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	return p4d_offset(pgd, address);
}

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a PMD on success, or NULL on failure.
 */
static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
	pud_t *pud;

	BUILD_BUG_ON(p4d_large(*p4d) != 0);
	if (p4d_none(*p4d)) {
		unsigned long new_pud_page = __get_free_page(gfp);
		if (!new_pud_page)
			return NULL;

		set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
	}

	pud = pud_offset(p4d, address);
	/* The user page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;

		set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
	}

	return pmd_offset(pud, address);
}

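/*
 * Usage sketch (illustration only): both walkers above follow the same
 * "descend, allocate a zeroed table page if the entry is empty" pattern.
 * Cloning one kernel PMD into the user tables therefore boils down to:
 *
 *	pmd_t *target_pmd = pti_user_pagetable_walk_pmd(addr);
 *
 *	if (target_pmd)
 *		*target_pmd = *pmd_offset(pud, addr);
 *
 * which is what pti_clone_pmds() below does, modulo clearing flags.
 */
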
#ifdef CONFIG_X86_VSYSCALL_EMULATION
/*
 * Walk the shadow copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.  Does not support large pages.
 *
 * Note: this is only used when mapping *new* kernel data into the
 * user/shadow page tables.  It is never used for userspace data.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
	pte_t *pte;

	/* We can't do anything sensible if we hit a large mapping. */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}

	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;

		set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
	}

	pte = pte_offset_kernel(pmd, address);
	if (pte_flags(*pte) & _PAGE_USER) {
		WARN_ONCE(1, "attempt to walk to user pte\n");
		return NULL;
	}
	return pte;
}

static void __init pti_setup_vsyscall(void)
{
	pte_t *pte, *target_pte;
	unsigned int level;

	pte = lookup_address(VSYSCALL_ADDR, &level);
	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
		return;

	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
	if (WARN_ON(!target_pte))
		return;

	*target_pte = *pte;
	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
}
#else
static void __init pti_setup_vsyscall(void) { }
#endif

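/*
 * Note (reasoning, not from this file): the vsyscall page is the only
 * mapping cloned into the user tables that userspace executes directly
 * at CPL3, which is why set_vsyscall_pgtable_user_bits() must set
 * _PAGE_USER on its whole paging hierarchy.  Everything else in the user
 * page tables is only touched by kernel code running on the user CR3
 * during entry/exit, so _KERNPG_TABLE entries without _PAGE_USER suffice.
 */
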
static void __init
pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
{
	unsigned long addr;

	/*
	 * Clone the populated PMDs which cover start to end. These PMD areas
	 * can have holes.
	 */
	for (addr = start; addr < end; addr += PMD_SIZE) {
		pmd_t *pmd, *target_pmd;
		pgd_t *pgd;
		p4d_t *p4d;
		pud_t *pud;

		pgd = pgd_offset_k(addr);
		if (WARN_ON(pgd_none(*pgd)))
			return;
		p4d = p4d_offset(pgd, addr);
		if (WARN_ON(p4d_none(*p4d)))
			return;
		pud = pud_offset(p4d, addr);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd))
			continue;

		target_pmd = pti_user_pagetable_walk_pmd(addr);
		if (WARN_ON(!target_pmd))
			return;

		/*
		 * Copy the PMD.  That is, the kernelmode and usermode
		 * tables will share the last-level page tables of this
		 * address range.
		 */
		*target_pmd = pmd_clear_flags(*pmd, clear);
	}
}

/*
 * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
 * next-level entry on 5-level systems).
 */
static void __init pti_clone_p4d(unsigned long addr)
{
	p4d_t *kernel_p4d, *user_p4d;
	pgd_t *kernel_pgd;

	user_p4d = pti_user_pagetable_walk_p4d(addr);
	kernel_pgd = pgd_offset_k(addr);
	kernel_p4d = p4d_offset(kernel_pgd, addr);
	*user_p4d = *kernel_p4d;
}

/*
 * Clone the CPU_ENTRY_AREA into the user space visible page table.
 */
static void __init pti_clone_user_shared(void)
{
	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
}

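/*
 * Note (illustration): because the clone happens at the p4d level, the
 * kernel and user tables share every table *below* that entry.  Later
 * updates to the cpu_entry_area mappings (e.g. as CPUs are brought up)
 * therefore become visible in the user tables automatically, with no
 * further cloning required.
 */
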
/*
 * Clone the ESPFIX P4D into the user space visible page table.
 */
static void __init pti_setup_espfix64(void)
{
#ifdef CONFIG_X86_ESPFIX64
	pti_clone_p4d(ESPFIX_BASE_ADDR);
#endif
}

/*
 * Clone the populated PMDs of the entry and irqentry text and force it RO.
 */
static void __init pti_clone_entry_text(void)
{
	pti_clone_pmds((unsigned long) __entry_text_start,
		       (unsigned long) __irqentry_text_end,
		       _PAGE_RW | _PAGE_GLOBAL);
}

/*
 * Initialize kernel page table isolation
 */
void __init pti_init(void)
{
	if (!static_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("enabled\n");

	pti_clone_user_shared();
	pti_clone_entry_text();
	pti_setup_espfix64();
	pti_setup_vsyscall();
}