From c1f19d153ad69363ac1bc62bbd9be05ca48c526c Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:16 -0700
Subject: [PATCH 040/241] x86/mm: Track the TLB's tlb_gen and update the
 flushing algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CVE-2017-5754

There are two kernel features that would benefit from tracking
how up-to-date each CPU's TLB is in the case where IPIs aren't keeping
it up to date in real time:

 - Lazy mm switching currently works by switching to init_mm when
   it would otherwise flush. This is wasteful: there isn't fundamentally
   any need to update CR3 at all when going lazy or when returning from
   lazy mode, nor is there any need to receive flush IPIs at all. Instead,
   we should just stop trying to keep the TLB coherent when we go lazy and,
   when unlazying, check whether we missed any flushes (see the sketch
   after this list).

 - PCID will let us keep recent user contexts alive in the TLB. If we
   start doing this, we need a way to decide whether those contexts are
   up to date.

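(Illustration only, not part of the patch: a minimal user-space C model
of the "check on unlazy" step described in the first item above. The
names mm_tlb_gen, cpu_tlb_gen, unlazy() and flush_local_tlb() are
made-up stand-ins for the per-mm generation counter, this CPU's copy of
it, and a real TLB flush.)

  /* Toy model only -- not kernel code. */
  #include <stdatomic.h>
  #include <stdint.h>
  #include <stdio.h>

  static atomic_uint_fast64_t mm_tlb_gen = 1; /* bumped by every flush request */
  static uint64_t cpu_tlb_gen = 1;            /* what this CPU has caught up to */

  static void flush_local_tlb(void) { puts("flush"); }

  /* While lazy, flush IPIs are ignored; on unlazy, catch up if we fell behind. */
  static void unlazy(void)
  {
          uint64_t gen = atomic_load(&mm_tlb_gen);

          if (cpu_tlb_gen < gen) {  /* we missed at least one flush while lazy */
                  flush_local_tlb();
                  cpu_tlb_gen = gen;
          }                         /* else: still coherent, no CR3 write needed */
  }

  int main(void)
  {
          atomic_fetch_add(&mm_tlb_gen, 1); /* a flush arrived while we were lazy */
          unlazy();                         /* flushes once and catches up */
          unlazy();                         /* no-op: nothing was missed */
          return 0;
  }
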
On some paravirt systems, remote TLBs can be flushed without IPIs.
This won't update the target CPUs' tlb_gens, which may cause
unnecessary local flushes later on. We can address this if it becomes
a problem by carefully updating the target CPU's tlb_gen directly.

By itself, this patch is a very minor optimization that avoids
unnecessary flushes when multiple TLB flushes targeting the same CPU
race. The complexity in this patch would not be worth it on its own,
but it will enable improved lazy TLB tracking and PCID (a toy model of
the racing-flush case is sketched below).

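(Again illustration only, not part of the patch: a toy C model of why
generation tracking turns the second of two racing flushes into a
no-op. flush_tlb_func(), mm_tlb_gen and local_tlb_gen are made-up
stand-ins for the IPI handler, the per-mm generation and the per-CPU
generation.)

  /* Toy model only -- not kernel code. */
  #include <stdatomic.h>
  #include <stdint.h>
  #include <stdio.h>

  static atomic_uint_fast64_t mm_tlb_gen = 1; /* latest generation for the mm */
  static uint64_t local_tlb_gen = 1;          /* what this CPU has caught up to */

  /* Models the flush IPI handler: catch all the way up, or do nothing. */
  static void flush_tlb_func(void)
  {
          uint64_t gen = atomic_load(&mm_tlb_gen);

          if (local_tlb_gen == gen) {
                  puts("no-op: an earlier flush already caught us up");
                  return;
          }
          puts("flush");
          local_tlb_gen = gen; /* jump to the latest gen, not just one step */
  }

  int main(void)
  {
          /* Two flush requests race: each bumps the generation ... */
          atomic_fetch_add(&mm_tlb_gen, 1);
          atomic_fetch_add(&mm_tlb_gen, 1);

          /* ... but only the first handler invocation does real work. */
          flush_tlb_func(); /* prints "flush" */
          flush_tlb_func(); /* prints the no-op message */
          return 0;
  }
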
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/1210fb244bc9cbe7677f7f0b72db4d359675f24b.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit b0579ade7cd82391360e959cc844e50a160e8a96)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit d34881c25f3c70228ed792fd62881185a25c4422)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
 arch/x86/include/asm/tlbflush.h |  43 +++++++++++++++--
 arch/x86/mm/tlb.c               | 102 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 135 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index f1f2e73b7b77..3a167c214560 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,11 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+struct tlb_context {
+	u64 ctx_id;
+	u64 tlb_gen;
+};
+
 struct tlb_state {
 	/*
 	 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
@@ -97,6 +102,21 @@ struct tlb_state {
 	 * disabling interrupts when modifying either one.
 	 */
 	unsigned long cr4;
+
+	/*
+	 * This is a list of all contexts that might exist in the TLB.
+	 * Since we don't yet use PCID, there is only one context.
+	 *
+	 * For each context, ctx_id indicates which mm the TLB's user
+	 * entries came from. As an invariant, the TLB will never
+	 * contain entries that are out-of-date as of when that mm reached
+	 * the tlb_gen in the list.
+	 *
+	 * To be clear, this means that it's legal for the TLB code to
+	 * flush the TLB without updating tlb_gen. This can happen
+	 * (for now, at least) due to paravirt remote flushes.
+	 */
+	struct tlb_context ctxs[1];
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
@@ -256,9 +276,26 @@ static inline void __flush_tlb_one(unsigned long addr)
  * and page-granular flushes are available only on i486 and up.
  */
 struct flush_tlb_info {
-	struct mm_struct *mm;
-	unsigned long start;
-	unsigned long end;
+	/*
+	 * We support several kinds of flushes.
+	 *
+	 * - Fully flush a single mm. .mm will be set, .end will be
+	 *   TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
+	 *   which the IPI sender is trying to catch us up.
+	 *
+	 * - Partially flush a single mm. .mm will be set, .start and
+	 *   .end will indicate the range, and .new_tlb_gen will be set
+	 *   such that the changes between generation .new_tlb_gen-1 and
+	 *   .new_tlb_gen are entirely contained in the indicated range.
+	 *
+	 * - Fully flush all mms whose tlb_gens have been updated. .mm
+	 *   will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
+	 *   will be zero.
+	 */
+	struct mm_struct	*mm;
+	unsigned long		start;
+	unsigned long		end;
+	u64			new_tlb_gen;
 };
 
 #define local_flush_tlb() __flush_tlb()
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 14f4f8f66aa8..4e5a5ddb9e4d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -105,6 +105,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	}
 
 	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
+	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen));
 
 	WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
 	cpumask_set_cpu(cpu, mm_cpumask(next));
@@ -155,25 +157,102 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	switch_ldt(real_prev, next);
 }
 
+/*
+ * flush_tlb_func_common()'s memory ordering requirement is that any
+ * TLB fills that happen after we flush the TLB are ordered after we
+ * read active_mm's tlb_gen. We don't need any explicit barriers
+ * because all x86 flush operations are serializing and the
+ * atomic64_read operation won't be reordered by the compiler.
+ */
 static void flush_tlb_func_common(const struct flush_tlb_info *f,
 				  bool local, enum tlb_flush_reason reason)
 {
+	/*
+	 * We have three different tlb_gen values in here. They are:
+	 *
+	 * - mm_tlb_gen:     the latest generation.
+	 * - local_tlb_gen:  the generation that this CPU has already caught
+	 *                   up to.
+	 * - f->new_tlb_gen: the generation that the requester of the flush
+	 *                   wants us to catch up to.
+	 */
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen);
+
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
 
+	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+		   loaded_mm->context.ctx_id);
+
 	if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
+		/*
+		 * leave_mm() is adequate to handle any type of flush, and
+		 * we would prefer not to receive further IPIs. leave_mm()
+		 * clears this CPU's bit in mm_cpumask().
+		 */
 		leave_mm(smp_processor_id());
 		return;
 	}
 
-	if (f->end == TLB_FLUSH_ALL) {
-		local_flush_tlb();
-		if (local)
-			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-		trace_tlb_flush(reason, TLB_FLUSH_ALL);
-	} else {
+	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
+		/*
+		 * There's nothing to do: we're already up to date. This can
+		 * happen if two concurrent flushes happen -- the first flush to
+		 * be handled can catch us all the way up, leaving no work for
+		 * the second flush.
+		 */
+		return;
+	}
+
+	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
+	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
+
+	/*
+	 * If we get to this point, we know that our TLB is out of date.
+	 * This does not strictly imply that we need to flush (it's
+	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
+	 * going to need to flush in the very near future, so we might
+	 * as well get it over with.
+	 *
+	 * The only question is whether to do a full or partial flush.
+	 *
+	 * We do a partial flush if requested and two extra conditions
+	 * are met:
+	 *
+	 * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
+	 *    we've always done all needed flushes to catch up to
+	 *    local_tlb_gen. If, for example, local_tlb_gen == 2 and
+	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
+	 *    us up to date for tlb_gen 3 is the partial flush we're
+	 *    processing.
+	 *
+	 *    As an example of why this check is needed, suppose that there
+	 *    are two concurrent flushes. The first is a full flush that
+	 *    changes context.tlb_gen from 1 to 2. The second is a partial
+	 *    flush that changes context.tlb_gen from 2 to 3. If they get
+	 *    processed on this CPU in reverse order, we'll see
+	 *    local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
+	 *    If we were to use __flush_tlb_single() and set local_tlb_gen to
+	 *    3, we'd break the invariant: we'd update local_tlb_gen above
+	 *    1 without the full flush that's needed for tlb_gen 2.
+	 *
+	 * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
+	 *    Partial TLB flushes are not all that much cheaper than full TLB
+	 *    flushes, so it seems unlikely that it would be a performance win
+	 *    to do a partial flush if that won't bring our TLB fully up to
+	 *    date. By doing a full flush instead, we can increase
+	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
+	 *    avoid another flush in the very near future.
+	 */
+	if (f->end != TLB_FLUSH_ALL &&
+	    f->new_tlb_gen == local_tlb_gen + 1 &&
+	    f->new_tlb_gen == mm_tlb_gen) {
+		/* Partial flush */
 		unsigned long addr;
 		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+
 		addr = f->start;
 		while (addr < f->end) {
 			__flush_tlb_single(addr);
@@ -182,7 +261,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		if (local)
 			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
 		trace_tlb_flush(reason, nr_pages);
+	} else {
+		/* Full flush. */
+		local_flush_tlb();
+		if (local)
+			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+		trace_tlb_flush(reason, TLB_FLUSH_ALL);
 	}
+
+	/* Both paths above update our state to mm_tlb_gen. */
+	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen);
 }
 
 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
@@ -253,7 +341,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	cpu = get_cpu();
 
 	/* This is also a barrier that synchronizes with switch_mm(). */
-	inc_mm_tlb_gen(mm);
+	info.new_tlb_gen = inc_mm_tlb_gen(mm);
 
 	/* Should we flush just the requested range? */
 	if ((end != TLB_FLUSH_ALL) &&
-- 
2.14.2