/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * This code is based in part on work published here:
 *
 *	https://github.com/IAIK/KAISER
 *
 * The original work was written by and signed off for the Linux
 * kernel by:
 *
 * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
 * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
 * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
 * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
 *
 * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
 * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
 * Andy Lutomirsky <luto@amacapital.net>
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

#include <asm/cpufeature.h>
#include <asm/hypervisor.h>
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/sections.h>

#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt

/* Backporting helper */
#ifndef __GFP_NOTRACK
#define __GFP_NOTRACK 0
#endif

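/*
 * Print a status line only when it is relevant: "insecure" notices only
 * on CPUs affected by Meltdown, "secure" notices only on CPUs that are
 * not.
 */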
static void __init pti_print_if_insecure(const char *reason)
{
	if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		pr_info("%s\n", reason);
}

static void __init pti_print_if_secure(const char *reason)
{
	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		pr_info("%s\n", reason);
}

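/*
 * Decide at boot time whether PTI should be enabled, based on the
 * "pti=" / "nopti" command line options and the Meltdown bug flag:
 *
 *	pti=off or nopti - keep PTI disabled
 *	pti=on           - force-enable PTI even on unaffected CPUs
 *	pti=auto         - enable PTI only on affected CPUs (the default)
 *
 * PTI is also left disabled when running as a Xen PV guest.
 */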
void __init pti_check_boottime_disable(void)
{
	char arg[5];
	int ret;

	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
		pti_print_if_insecure("disabled on XEN PV.");
		return;
	}

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (ret == 3 && !strncmp(arg, "off", 3)) {
			pti_print_if_insecure("disabled on command line.");
			return;
		}
		if (ret == 2 && !strncmp(arg, "on", 2)) {
			pti_print_if_secure("force enabled on command line.");
			goto enable;
		}
		if (ret == 4 && !strncmp(arg, "auto", 4))
			goto autosel;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti")) {
		pti_print_if_insecure("disabled on command line.");
		return;
	}

autosel:
	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		return;
enable:
	setup_force_cpu_cap(X86_FEATURE_PTI);
}

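/*
 * Mirror a PGD update into the user (shadow) page tables.  Entries that
 * map userspace are copied into the user PGD; the returned value is the
 * entry the kernel page tables should use, possibly with NX added as a
 * hardening measure (see below).
 */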
pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
	/*
	 * Changes to the high (kernel) portion of the kernelmode page
	 * tables are not automatically propagated to the usermode tables.
	 *
	 * Users should keep in mind that, unlike the kernelmode tables,
	 * there is no vmalloc_fault equivalent for the usermode tables.
	 * Top-level entries added to init_mm's usermode pgd after boot
	 * will not be automatically propagated to other mms.
	 */
	if (!pgdp_maps_userspace(pgdp))
		return pgd;

	/*
	 * The user page tables get the full PGD, accessible from
	 * userspace:
	 */
	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;

	/*
	 * If this is normal user memory, make it NX in the kernel
	 * pagetables so that, if we somehow screw up and return to
	 * usermode with the kernel CR3 loaded, we'll get a page fault
	 * instead of allowing user code to execute with the wrong CR3.
	 *
	 * As exceptions, we don't set NX if:
	 *  - _PAGE_USER is not set. This could be an executable
	 *     EFI runtime mapping or something similar, and the kernel
	 *     may execute from it
	 *  - we don't have NX support
	 *  - we're clearing the PGD (i.e. the new pgd is not present).
	 */
	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
	    (__supported_pte_mask & _PAGE_NX))
		pgd.pgd |= _PAGE_NX;

	/* return the copy of the PGD we want the kernel to use: */
	return pgd;
}

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a P4D on success, or NULL on failure.
 */
static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
{
	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

	if (address < PAGE_OFFSET) {
		WARN_ONCE(1, "attempt to walk user address\n");
		return NULL;
	}

	if (pgd_none(*pgd)) {
		unsigned long new_p4d_page = __get_free_page(gfp);
		if (!new_p4d_page)
			return NULL;

		set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	return p4d_offset(pgd, address);
}

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a PMD on success, or NULL on failure.
 */
static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
	pud_t *pud;

	BUILD_BUG_ON(p4d_large(*p4d) != 0);
	if (p4d_none(*p4d)) {
		unsigned long new_pud_page = __get_free_page(gfp);
		if (!new_pud_page)
			return NULL;

		set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
	}

	pud = pud_offset(p4d, address);
	/* The user page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;

		set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
	}

	return pmd_offset(pud, address);
}

#ifdef CONFIG_X86_VSYSCALL_EMULATION
/*
 * Walk the shadow copy of the page tables (optionally) trying to allocate
 * page table pages on the way down. Does not support large pages.
 *
 * Note: this is only used when mapping *new* kernel data into the
 * user/shadow page tables. It is never used for userspace data.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
	pte_t *pte;

	/* We can't do anything sensible if we hit a large mapping. */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}

	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;

		set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
	}

	pte = pte_offset_kernel(pmd, address);
	if (pte_flags(*pte) & _PAGE_USER) {
		WARN_ONCE(1, "attempt to walk to user pte\n");
		return NULL;
	}
	return pte;
}

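/*
 * Keep the vsyscall page reachable with the user CR3: copy its kernel
 * PTE into the user page tables and mark the covering page table
 * entries as user-accessible.
 */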
static void __init pti_setup_vsyscall(void)
{
	pte_t *pte, *target_pte;
	unsigned int level;

	pte = lookup_address(VSYSCALL_ADDR, &level);
	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
		return;

	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
	if (WARN_ON(!target_pte))
		return;

	*target_pte = *pte;
	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
}
#else
static void __init pti_setup_vsyscall(void) { }
#endif

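/*
 * Clone the kernel mappings covering [start, end) into the user page
 * tables at PMD granularity, clearing the page table bits in 'clear'
 * from each copied PMD (pti_clone_entry_text() passes
 * _PAGE_RW | _PAGE_GLOBAL, for example).
 */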
static void __init
pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
{
	unsigned long addr;

	/*
	 * Clone the populated PMDs which cover start to end. These PMD areas
	 * can have holes.
	 */
	for (addr = start; addr < end; addr += PMD_SIZE) {
		pmd_t *pmd, *target_pmd;
		pgd_t *pgd;
		p4d_t *p4d;
		pud_t *pud;

		pgd = pgd_offset_k(addr);
		if (WARN_ON(pgd_none(*pgd)))
			return;
		p4d = p4d_offset(pgd, addr);
		if (WARN_ON(p4d_none(*p4d)))
			return;
		pud = pud_offset(p4d, addr);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd))
			continue;

		target_pmd = pti_user_pagetable_walk_pmd(addr);
		if (WARN_ON(!target_pmd))
			return;

		/*
		 * Copy the PMD. That is, the kernelmode and usermode
		 * tables will share the last-level page tables of this
		 * address range
		 */
		*target_pmd = pmd_clear_flags(*pmd, clear);
	}
}

/*
 * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
 * next-level entry on 5-level systems).
 */
static void __init pti_clone_p4d(unsigned long addr)
{
	p4d_t *kernel_p4d, *user_p4d;
	pgd_t *kernel_pgd;

	user_p4d = pti_user_pagetable_walk_p4d(addr);
	kernel_pgd = pgd_offset_k(addr);
	kernel_p4d = p4d_offset(kernel_pgd, addr);
	*user_p4d = *kernel_p4d;
}

/*
 * Clone the CPU_ENTRY_AREA into the user space visible page table.
 */
static void __init pti_clone_user_shared(void)
{
	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
}

/*
 * Clone the ESPFIX P4D into the user space visible page table
 */
static void __init pti_setup_espfix64(void)
{
#ifdef CONFIG_X86_ESPFIX64
	pti_clone_p4d(ESPFIX_BASE_ADDR);
#endif
}

/*
 * Clone the populated PMDs of the entry and irqentry text and force it RO.
 */
static void __init pti_clone_entry_text(void)
{
	pti_clone_pmds((unsigned long) __entry_text_start,
		       (unsigned long) __irqentry_text_end,
		       _PAGE_RW | _PAGE_GLOBAL);
}

/*
 * Initialize kernel page table isolation
 */
void __init pti_init(void)
{
	if (!static_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("enabled\n");

	pti_clone_user_shared();
	pti_clone_entry_text();
	pti_setup_espfix64();
	pti_setup_vsyscall();
}