From e3c7bff633fc1210c6b19dd3ebcafb9f6716d586 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 24 Jul 2017 21:41:38 -0700
Subject: [PATCH 042/233] x86/mm: Implement PCID based optimization: try to
 preserve old TLB entries using PCID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CVE-2017-5754

PCID is a "process context ID" -- it's what other architectures call
an address space ID.  Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB.  x86's
PCID is 12 bits.

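To make the layout concrete: the PCID lives in the low 12 bits of CR3,
the page-table base sits in the address bits above it, and (with
CR4.PCIDE enabled) bit 63 asks the CPU not to flush the incoming
PCID's entries.  A minimal user-space sketch, not kernel code --
make_cr3() is a hypothetical helper, and the mask values mirror
arch/x86/include/asm/processor-flags.h as touched by this patch:

  #include <assert.h>
  #include <stdint.h>

  #define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull  /* page-table base bits */
  #define CR3_PCID_MASK 0xFFFull               /* 12-bit PCID */
  #define CR3_NOFLUSH   (1ull << 63)           /* "don't flush" hint */

  /* Compose a CR3 value from a page-table physical address, a PCID,
   * and optionally the no-flush bit. */
  static uint64_t make_cr3(uint64_t pgd_pa, uint16_t pcid, int noflush)
  {
          assert((pgd_pa & ~CR3_ADDR_MASK) == 0);
          assert((pcid & ~CR3_PCID_MASK) == 0);
          return pgd_pa | pcid | (noflush ? CR3_NOFLUSH : 0);
  }

A CR3 write with the no-flush bit set keeps the new PCID's cached
translations alive, which is what the no-flush path below relies on.
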
This is an unorthodox approach to using PCID.  x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs.  To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.

This patch uses PCID differently.  We use a PCID to identify a
recently-used mm on a per-cpu basis.  An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.

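As a stand-alone model of that policy -- a simplified sketch of what
choose_new_asid() below does for a single CPU, with the per-cpu
accessors and kernel types elided:

  #include <stdbool.h>
  #include <stdint.h>

  #define TLB_NR_DYN_ASIDS 6

  /* One slot per dynamic ASID: which mm (ctx_id) owns it and how
   * fresh its cached TLB contents (tlb_gen) are. */
  static struct { uint64_t ctx_id; uint64_t tlb_gen; } ctxs[TLB_NR_DYN_ASIDS];
  static uint16_t next_asid = 1;

  static uint16_t pick_asid(uint64_t ctx_id, uint64_t next_tlb_gen,
                            bool *need_flush)
  {
          uint16_t asid;

          /* Reuse this mm's recent slot if it has one; flush only
           * if the slot's cached TLB generation is stale. */
          for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
                  if (ctxs[asid].ctx_id != ctx_id)
                          continue;
                  *need_flush = ctxs[asid].tlb_gen < next_tlb_gen;
                  return asid;
          }

          /* No slot owned: take the next slot round-robin and flush. */
          asid = next_asid++;
          if (asid >= TLB_NR_DYN_ASIDS) {
                  asid = 0;
                  next_asid = 1;
          }
          *need_flush = true;
          return asid;
  }

The caller then fills in the chosen slot and loads CR3, setting the
no-flush bit whenever *need_flush comes back false.
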
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs).  I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers.  Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.

ping-pong between two mms on the same CPU using eventfd:

  patched:         1.22µs
  patched, nopcid: 1.33µs
  unpatched:       1.34µs

Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration.  dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:

  patched:         1.8µs  11M  dTLB misses
  patched, nopcid: 6.2µs, 207M dTLB misses
  unpatched:       6.1µs, 190M dTLB misses

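Note that the patched numbers assume PCID-capable hardware (and KVM
exposing it to the guest).  As an illustrative aside, not part of the
patch, PCID support can be checked from user space via CPUID leaf 1,
ECX bit 17:

  #include <cpuid.h>
  #include <stdio.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          /* CPUID leaf 1: feature bits; ECX bit 17 advertises PCID. */
          if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                  printf("pcid: %s\n", (ecx & (1u << 17)) ? "yes" : "no");
          return 0;
  }
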
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(backported from commit 10af6235e0d327d42e1bad974385197817923dc1)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit d833a976288cdcf7fb1dabb48ebf614ebf6a311c)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
 arch/x86/include/asm/mmu_context.h     |  3 ++
 arch/x86/include/asm/processor-flags.h |  2 +
 arch/x86/include/asm/tlbflush.h        | 18 +++++++-
 arch/x86/mm/init.c                     |  1 +
 arch/x86/mm/tlb.c                      | 84 +++++++++++++++++++++++++---------
 5 files changed, 85 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index d6b055b328f2..7ae318c340d9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -298,6 +298,9 @@ static inline unsigned long __get_current_cr3_fast(void)
 {
 	unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
 
+	if (static_cpu_has(X86_FEATURE_PCID))
+		cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
 	/* For now, be very restrictive about when this can be called. */
 	VM_WARN_ON(in_nmi() || preemptible());
 
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 79aa2f98398d..791b60199aa4 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -35,6 +35,7 @@
 /* Mask off the address space ID bits. */
 #define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
 #define CR3_PCID_MASK 0xFFFull
+#define CR3_NOFLUSH (1UL << 63)
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
@@ -42,6 +43,7 @@
  */
 #define CR3_ADDR_MASK 0xFFFFFFFFull
 #define CR3_PCID_MASK 0ull
+#define CR3_NOFLUSH 0
 #endif
 
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6397275008db..d23e61dc0640 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,12 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in
+ * two cache lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -95,6 +101,8 @@ struct tlb_state {
 	 * mode even if we've already switched back to swapper_pg_dir.
 	 */
 	struct mm_struct *loaded_mm;
+	u16 loaded_mm_asid;
+	u16 next_asid;
 
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
@@ -104,7 +112,8 @@ struct tlb_state {
 
 	/*
 	 * This is a list of all contexts that might exist in the TLB.
-	 * Since we don't yet use PCID, there is only one context.
+	 * There is one per ASID that we use, and the ASID (what the
+	 * CPU calls PCID) is the index into ctxts.
 	 *
 	 * For each context, ctx_id indicates which mm the TLB's user
 	 * entries came from.  As an invariant, the TLB will never
@@ -114,8 +123,13 @@ struct tlb_state {
 	 * To be clear, this means that it's legal for the TLB code to
 	 * flush the TLB without updating tlb_gen.  This can happen
 	 * (for now, at least) due to paravirt remote flushes.
+	 *
+	 * NB: context 0 is a bit special, since it's also used by
+	 * various bits of init code.  This is fine -- code that
+	 * isn't aware of PCID will end up harmlessly flushing
+	 * context 0.
 	 */
-	struct tlb_context ctxs[1];
+	struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c86dc071bb10..af5c1ed21d43 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -849,6 +849,7 @@ void __init zone_sizes_init(void)
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
+	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
 };
 EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0982c997d36f..57943b4d8f2e 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,40 @@
 
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
+static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
+			    u16 *new_asid, bool *need_flush)
+{
+	u16 asid;
+
+	if (!static_cpu_has(X86_FEATURE_PCID)) {
+		*new_asid = 0;
+		*need_flush = true;
+		return;
+	}
+
+	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
+		    next->context.ctx_id)
+			continue;
+
+		*new_asid = asid;
+		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
+			       next_tlb_gen);
+		return;
+	}
+
+	/*
+	 * We don't currently own an ASID slot on this CPU.
+	 * Allocate a slot.
+	 */
+	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
+	if (*new_asid >= TLB_NR_DYN_ASIDS) {
+		*new_asid = 0;
+		this_cpu_write(cpu_tlbstate.next_asid, 1);
+	}
+	*need_flush = true;
+}
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -66,6 +100,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
 
@@ -85,12 +120,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	/*
 	 * Verify that CR3 is what we think it is.  This will catch
 	 * hypothetical buggy code that directly switches to swapper_pg_dir
-	 * without going through leave_mm() / switch_mm_irqs_off().
+	 * without going through leave_mm() / switch_mm_irqs_off() or that
+	 * does something like write_cr3(read_cr3_pa()).
 	 */
-	VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd));
+	VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
 
 	if (real_prev == next) {
-		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
 			  next->context.ctx_id);
 
 		if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
@@ -107,16 +143,17 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 			next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
-			if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) {
+			if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
+			    next_tlb_gen) {
 				/*
 				 * Ideally, we'd have a flush_tlb() variant that
 				 * takes the known CR3 value as input.  This would
 				 * be faster on Xen PV and on hypothetical CPUs
 				 * on which INVPCID is fast.
 				 */
-				this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
+				this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
 					       next_tlb_gen);
-				write_cr3(__pa(next->pgd));
+				write_cr3(__pa(next->pgd) | prev_asid);
 
 				/*
 				 * This gets called via leave_mm() in the idle path
@@ -134,8 +171,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		 * are not reflected in tlb_gen.)
 		 */
 	} else {
-		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) ==
-			  next->context.ctx_id);
+		u16 new_asid;
+		bool need_flush;
 
 		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
 			/*
@@ -162,18 +199,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
-		this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
-		this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen);
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		write_cr3(__pa(next->pgd));
+		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
-		/*
-		 * This gets called via leave_mm() in the idle path where RCU
-		 * functions differently.  Tracing normally uses RCU, so we
-		 * have to call the tracepoint specially here.
-		 */
-		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
+		if (need_flush) {
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+			write_cr3(__pa(next->pgd) | new_asid);
+			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
 				TLB_FLUSH_ALL);
+		} else {
+			/* The new ASID is already up to date. */
+			write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+		}
+
+		this_cpu_write(cpu_tlbstate.loaded_mm, next);
+		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 	}
 
 	load_mm_cr4(next);
@@ -200,13 +241,14 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	 * wants us to catch up to.
 	 */
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
-	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen);
+	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
 
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
 
-	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
 		   loaded_mm->context.ctx_id);
 
 	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
@@ -294,7 +336,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	}
 
 	/* Both paths above update our state to mm_tlb_gen. */
-	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen);
+	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
 }
 
 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
-- 
2.14.2