]>
Commit | Line | Data |
---|---|---|
2025cf9e | 1 | // SPDX-License-Identifier: GPL-2.0-only |
3891a04a PA |
2 | /* ----------------------------------------------------------------------- * |
3 | * | |
4 | * Copyright 2014 Intel Corporation; author: H. Peter Anvin | |
5 | * | |
3891a04a PA |
6 | * ----------------------------------------------------------------------- */ |
7 | ||
8 | /* | |
9 | * The IRET instruction, when returning to a 16-bit segment, only | |
10 | * restores the bottom 16 bits of the user space stack pointer. This | |
11 | * causes some 16-bit software to break, but it also leaks kernel state | |
12 | * to user space. | |
13 | * | |
14 | * This works around this by creating percpu "ministacks", each of which | |
15 | * is mapped 2^16 times 64K apart. When we detect that the return SS is | |
16 | * on the LDT, we copy the IRET frame to the ministack and use the | |
17 | * relevant alias to return to userspace. The ministacks are mapped | |
18 | * readonly, so if the IRET faults we promote #GP to #DF, which is an IST | |
19 | * vector and thus has its own stack; we then do the fixup in the #DF | |
20 | * handler. | |
21 | * | |
22 | * This file sets up the ministacks and the related page tables. The | |
23 | * actual ministack invocation is in entry_64.S. | |
24 | */ | |
25 | ||
26 | #include <linux/init.h> | |
27 | #include <linux/init_task.h> | |
28 | #include <linux/kernel.h> | |
29 | #include <linux/percpu.h> | |
30 | #include <linux/gfp.h> | |
31 | #include <linux/random.h> | |
32 | #include <asm/pgtable.h> | |
33 | #include <asm/pgalloc.h> | |
34 | #include <asm/setup.h> | |
e1fe9ed8 | 35 | #include <asm/espfix.h> |
3891a04a PA |
36 | |
37 | /* | |
38 | * Note: we only need 6*8 = 48 bytes for the espfix stack, but round | |
39 | * it up to a cache line to avoid unnecessary sharing. | |
40 | */ | |
41 | #define ESPFIX_STACK_SIZE (8*8UL) | |
42 | #define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) | |
43 | ||
44 | /* There is address space for how many espfix pages? */ | |
1d33b219 | 45 | #define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16)) |
3891a04a PA |
46 | |
47 | #define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) | |
48 | #if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS | |
1d33b219 | 49 | # error "Need more virtual address space for the ESPFIX hack" |
3891a04a PA |
50 | #endif |
51 | ||
75f296d9 | 52 | #define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) |
3891a04a PA |
53 | |
54 | /* This contains the *bottom* address of the espfix stack */ | |
55 | DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); | |
56 | DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); | |
57 | ||
58 | /* Initialization mutex - should this be a spinlock? */ | |
59 | static DEFINE_MUTEX(espfix_init_mutex); | |
60 | ||
61 | /* Allocated espfix stack pages - each page serves ESPFIX_STACKS_PER_PAGE CPUs */ | |
62 | #define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE) | |
63 | static void *espfix_pages[ESPFIX_MAX_PAGES]; | |
64 | ||
65 | static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD] | |
66 | __aligned(PAGE_SIZE); | |
67 | ||
68 | static unsigned int page_random, slot_random; | |
69 | ||
70 | /* | |
71 | * This returns the bottom address of the espfix stack for a specific CPU. | |
72 | * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case | |
73 | * we have to account for some amount of padding at the end of each page. | |
74 | */ | |
75 | static inline unsigned long espfix_base_addr(unsigned int cpu) | |
76 | { | |
77 | unsigned long page, slot; | |
78 | unsigned long addr; | |
79 | ||
80 | page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random; | |
81 | slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE; | |
82 | addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE); | |
83 | addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16); | |
84 | addr += ESPFIX_BASE_ADDR; | |
85 | return addr; | |
86 | } | |
87 | ||
88 | #define PTE_STRIDE (65536/PAGE_SIZE) | |
89 | #define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE) | |
90 | #define ESPFIX_PMD_CLONES PTRS_PER_PMD | |
91 | #define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES)) | |
92 | ||
93 | #define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX) | |
94 | ||
95 | static void init_espfix_random(void) | |
96 | { | |
97 | unsigned long rand; | |
98 | ||
99 | /* | |
100 | * This is run before the entropy pools are initialized, | |
101 | * but this is hopefully better than nothing. | |
102 | */ | |
103 | if (!arch_get_random_long(&rand)) { | |
104 | /* The constant is an arbitrary large prime */ | |
4ea1636b | 105 | rand = rdtsc(); |
3891a04a PA |
106 | rand *= 0xc345c6b72fd16123UL; |
107 | } | |
108 | ||
109 | slot_random = rand % ESPFIX_STACKS_PER_PAGE; | |
110 | page_random = (rand / ESPFIX_STACKS_PER_PAGE) | |
111 | & (ESPFIX_PAGE_SPACE - 1); | |
112 | } | |
113 | ||
114 | void __init init_espfix_bsp(void) | |
115 | { | |
1d33b219 KS |
116 | pgd_t *pgd; |
117 | p4d_t *p4d; | |
3891a04a PA |
118 | |
119 | /* Install the espfix pud into the kernel page directory */ | |
65ade2f8 | 120 | pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; |
1d33b219 KS |
121 | p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); |
122 | p4d_populate(&init_mm, p4d, espfix_pud_page); | |
3891a04a PA |
123 | |
124 | /* Randomize the locations */ | |
125 | init_espfix_random(); | |
126 | ||
127 | /* The rest is the same as for any other processor */ | |
1db87563 | 128 | init_espfix_ap(0); |
3891a04a PA |
129 | } |
130 | ||
1db87563 | 131 | void init_espfix_ap(int cpu) |
3891a04a | 132 | { |
1db87563 | 133 | unsigned int page; |
3891a04a PA |
134 | unsigned long addr; |
135 | pud_t pud, *pud_p; | |
136 | pmd_t pmd, *pmd_p; | |
137 | pte_t pte, *pte_p; | |
20d5e4a9 | 138 | int n, node; |
3891a04a PA |
139 | void *stack_page; |
140 | pteval_t ptemask; | |
141 | ||
142 | /* We only have to do this once... */ | |
20d5e4a9 | 143 | if (likely(per_cpu(espfix_stack, cpu))) |
3891a04a PA |
144 | return; /* Already initialized */ |
145 | ||
3891a04a PA |
146 | addr = espfix_base_addr(cpu); |
147 | page = cpu/ESPFIX_STACKS_PER_PAGE; | |
148 | ||
149 | /* Did another CPU already set this up? */ | |
6aa7de05 | 150 | stack_page = READ_ONCE(espfix_pages[page]); |
3891a04a PA |
151 | if (likely(stack_page)) |
152 | goto done; | |
153 | ||
154 | mutex_lock(&espfix_init_mutex); | |
155 | ||
156 | /* Did we race on the lock? */ | |
6aa7de05 | 157 | stack_page = READ_ONCE(espfix_pages[page]); |
3891a04a PA |
158 | if (stack_page) |
159 | goto unlock_done; | |
160 | ||
20d5e4a9 | 161 | node = cpu_to_node(cpu); |
3891a04a PA |
162 | ptemask = __supported_pte_mask; |
163 | ||
164 | pud_p = &espfix_pud_page[pud_index(addr)]; | |
165 | pud = *pud_p; | |
166 | if (!pud_present(pud)) { | |
20d5e4a9 ZG |
167 | struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); |
168 | ||
169 | pmd_p = (pmd_t *)page_address(page); | |
3891a04a | 170 | pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); |
8762e509 | 171 | paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); |
3891a04a PA |
172 | for (n = 0; n < ESPFIX_PUD_CLONES; n++) |
173 | set_pud(&pud_p[n], pud); | |
174 | } | |
175 | ||
176 | pmd_p = pmd_offset(&pud, addr); | |
177 | pmd = *pmd_p; | |
178 | if (!pmd_present(pmd)) { | |
20d5e4a9 ZG |
179 | struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); |
180 | ||
181 | pte_p = (pte_t *)page_address(page); | |
3891a04a | 182 | pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); |
8762e509 | 183 | paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); |
3891a04a PA |
184 | for (n = 0; n < ESPFIX_PMD_CLONES; n++) |
185 | set_pmd(&pmd_p[n], pmd); | |
186 | } | |
187 | ||
188 | pte_p = pte_offset_kernel(&pmd, addr); | |
20d5e4a9 | 189 | stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); |
6baf4bec DH |
190 | /* |
191 | * __PAGE_KERNEL_* includes _PAGE_GLOBAL, which we want since | |
192 | * this is mapped to userspace. | |
193 | */ | |
21729f81 | 194 | pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); |
3891a04a PA |
195 | for (n = 0; n < ESPFIX_PTE_CLONES; n++) |
196 | set_pte(&pte_p[n*PTE_STRIDE], pte); | |
197 | ||
198 | /* Job is done for this CPU and any CPU which shares this page */ | |
6aa7de05 | 199 | WRITE_ONCE(espfix_pages[page], stack_page); |
3891a04a PA |
200 | |
201 | unlock_done: | |
202 | mutex_unlock(&espfix_init_mutex); | |
203 | done: | |
20d5e4a9 ZG |
204 | per_cpu(espfix_stack, cpu) = addr; |
205 | per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page | |
206 | + (addr & ~PAGE_MASK); | |
3891a04a | 207 | } |