1 From c1f19d153ad69363ac1bc62bbd9be05ca48c526c Mon Sep 17 00:00:00 2001
2 From: Andy Lutomirski <luto@kernel.org>
3 Date: Thu, 29 Jun 2017 08:53:16 -0700
4 Subject: [PATCH 040/241] x86/mm: Track the TLB's tlb_gen and update the
5 flushing algorithm
6 MIME-Version: 1.0
7 Content-Type: text/plain; charset=UTF-8
8 Content-Transfer-Encoding: 8bit
9
10 CVE-2017-5754
11
12 There are two kernel features that would benefit from tracking
13 how up-to-date each CPU's TLB is in the case where IPIs aren't keeping
14 it up to date in real time:
15
16 - Lazy mm switching currently works by switching to init_mm when
17 it would otherwise flush. This is wasteful: there isn't fundamentally
18 any need to update CR3 at all when going lazy or when returning from
19 lazy mode, nor is there any need to receive flush IPIs at all. Instead,
20 we should just stop trying to keep the TLB coherent when we go lazy and,
21 when unlazying, check whether we missed any flushes.
22
23 - PCID will let us keep recent user contexts alive in the TLB. If we
24 start doing this, we need a way to decide whether those contexts are
25 up to date.
26
27 On some paravirt systems, remote TLBs can be flushed without IPIs.
28 This won't update the target CPUs' tlb_gens, which may cause
29 unnecessary local flushes later on. We can address this if it becomes
30 a problem by carefully updating the target CPU's tlb_gen directly.
31
32 By itself, this patch is a very minor optimization that avoids
33 unnecessary flushes when multiple TLB flushes targeting the same CPU
34 race. The complexity in this patch would not be worth it on its own,
35 but it will enable improved lazy TLB tracking and PCID.
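
The bookkeeping behind this can be illustrated with a small, standalone
C sketch (simplified types and invented names, not the kernel's actual
per-CPU machinery): each mm carries a monotonically increasing flush
generation, each CPU remembers the generation it has already caught up
to, and a flush request that finds the CPU already up to date becomes a
no-op.

    #include <stdatomic.h>
    #include <stdbool.h>

    /* One counter per address space: bumped whenever its mappings change. */
    struct mm_gen {
            atomic_ullong tlb_gen;
    };

    /* What one CPU has already flushed for that address space. */
    struct cpu_tlb {
            unsigned long long local_tlb_gen;
    };

    /* A flusher bumps the generation and asks other CPUs to catch up to it. */
    static unsigned long long bump_gen(struct mm_gen *mm)
    {
            return atomic_fetch_add(&mm->tlb_gen, 1) + 1;
    }

    /*
     * On the receiving CPU: if a racing flush already brought us up to
     * date, there is nothing left to do.
     */
    static bool needs_flush(struct cpu_tlb *cpu, struct mm_gen *mm)
    {
            return cpu->local_tlb_gen < atomic_load(&mm->tlb_gen);
    }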
36
37 Signed-off-by: Andy Lutomirski <luto@kernel.org>
38 Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
39 Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
40 Cc: Andrew Morton <akpm@linux-foundation.org>
41 Cc: Arjan van de Ven <arjan@linux.intel.com>
42 Cc: Borislav Petkov <bp@alien8.de>
43 Cc: Dave Hansen <dave.hansen@intel.com>
44 Cc: Linus Torvalds <torvalds@linux-foundation.org>
45 Cc: Mel Gorman <mgorman@suse.de>
46 Cc: Peter Zijlstra <peterz@infradead.org>
47 Cc: Rik van Riel <riel@redhat.com>
48 Cc: linux-mm@kvack.org
49 Link: http://lkml.kernel.org/r/1210fb244bc9cbe7677f7f0b72db4d359675f24b.1498751203.git.luto@kernel.org
50 Signed-off-by: Ingo Molnar <mingo@kernel.org>
51 (cherry picked from commit b0579ade7cd82391360e959cc844e50a160e8a96)
52 Signed-off-by: Andy Whitcroft <apw@canonical.com>
53 Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
54 (cherry picked from commit d34881c25f3c70228ed792fd62881185a25c4422)
55 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
56 ---
57 arch/x86/include/asm/tlbflush.h | 43 +++++++++++++++--
58 arch/x86/mm/tlb.c | 102 +++++++++++++++++++++++++++++++++++++---
59 2 files changed, 135 insertions(+), 10 deletions(-)
60
61 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
62 index f1f2e73b7b77..3a167c214560 100644
63 --- a/arch/x86/include/asm/tlbflush.h
64 +++ b/arch/x86/include/asm/tlbflush.h
65 @@ -82,6 +82,11 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
66 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
67 #endif
68
69 +struct tlb_context {
70 + u64 ctx_id;
71 + u64 tlb_gen;
72 +};
73 +
74 struct tlb_state {
75 /*
76 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
77 @@ -97,6 +102,21 @@ struct tlb_state {
78 * disabling interrupts when modifying either one.
79 */
80 unsigned long cr4;
81 +
82 + /*
83 + * This is a list of all contexts that might exist in the TLB.
84 + * Since we don't yet use PCID, there is only one context.
85 + *
86 + * For each context, ctx_id indicates which mm the TLB's user
87 + * entries came from. As an invariant, the TLB will never
88 + * contain entries that are out-of-date as when that mm reached
89 + * the tlb_gen in the list.
90 + *
91 + * To be clear, this means that it's legal for the TLB code to
92 + * flush the TLB without updating tlb_gen. This can happen
93 + * (for now, at least) due to paravirt remote flushes.
94 + */
95 + struct tlb_context ctxs[1];
96 };
97 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
98
99 @@ -256,9 +276,26 @@ static inline void __flush_tlb_one(unsigned long addr)
100 * and page-granular flushes are available only on i486 and up.
101 */
102 struct flush_tlb_info {
103 - struct mm_struct *mm;
104 - unsigned long start;
105 - unsigned long end;
106 + /*
107 + * We support several kinds of flushes.
108 + *
109 + * - Fully flush a single mm. .mm will be set, .end will be
110 + * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
111 + * which the IPI sender is trying to catch us up.
112 + *
113 + * - Partially flush a single mm. .mm will be set, .start and
114 + * .end will indicate the range, and .new_tlb_gen will be set
115 + * such that the changes between generation .new_tlb_gen-1 and
116 + * .new_tlb_gen are entirely contained in the indicated range.
117 + *
118 + * - Fully flush all mms whose tlb_gens have been updated. .mm
119 + * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
120 + * will be zero.
121 + */
122 + struct mm_struct *mm;
123 + unsigned long start;
124 + unsigned long end;
125 + u64 new_tlb_gen;
126 };
127
128 #define local_flush_tlb() __flush_tlb()
129 diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
130 index 14f4f8f66aa8..4e5a5ddb9e4d 100644
131 --- a/arch/x86/mm/tlb.c
132 +++ b/arch/x86/mm/tlb.c
133 @@ -105,6 +105,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
134 }
135
136 this_cpu_write(cpu_tlbstate.loaded_mm, next);
137 + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
138 + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen));
139
140 WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
141 cpumask_set_cpu(cpu, mm_cpumask(next));
142 @@ -155,25 +157,102 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
143 switch_ldt(real_prev, next);
144 }
145
146 +/*
147 + * flush_tlb_func_common()'s memory ordering requirement is that any
148 + * TLB fills that happen after we flush the TLB are ordered after we
149 + * read active_mm's tlb_gen. We don't need any explicit barriers
150 + * because all x86 flush operations are serializing and the
151 + * atomic64_read operation won't be reordered by the compiler.
152 + */
153 static void flush_tlb_func_common(const struct flush_tlb_info *f,
154 bool local, enum tlb_flush_reason reason)
155 {
156 + /*
157 + * We have three different tlb_gen values in here. They are:
158 + *
159 + * - mm_tlb_gen: the latest generation.
160 + * - local_tlb_gen: the generation that this CPU has already caught
161 + * up to.
162 + * - f->new_tlb_gen: the generation that the requester of the flush
163 + * wants us to catch up to.
164 + */
165 + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
166 + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
167 + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen);
168 +
169 /* This code cannot presently handle being reentered. */
170 VM_WARN_ON(!irqs_disabled());
171
172 + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
173 + loaded_mm->context.ctx_id);
174 +
175 if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
176 + /*
177 + * leave_mm() is adequate to handle any type of flush, and
178 + * we would prefer not to receive further IPIs. leave_mm()
179 + * clears this CPU's bit in mm_cpumask().
180 + */
181 leave_mm(smp_processor_id());
182 return;
183 }
184
185 - if (f->end == TLB_FLUSH_ALL) {
186 - local_flush_tlb();
187 - if (local)
188 - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
189 - trace_tlb_flush(reason, TLB_FLUSH_ALL);
190 - } else {
191 + if (unlikely(local_tlb_gen == mm_tlb_gen)) {
192 + /*
193 + * There's nothing to do: we're already up to date. This can
194 + * happen if two concurrent flushes happen -- the first flush to
195 + * be handled can catch us all the way up, leaving no work for
196 + * the second flush.
197 + */
198 + return;
199 + }
200 +
201 + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
202 + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
203 +
204 + /*
205 + * If we get to this point, we know that our TLB is out of date.
206 + * This does not strictly imply that we need to flush (it's
207 + * possible that f->new_tlb_gen <= local_tlb_gen), but we're
208 + * going to need to flush in the very near future, so we might
209 + * as well get it over with.
210 + *
211 + * The only question is whether to do a full or partial flush.
212 + *
213 + * We do a partial flush if requested and two extra conditions
214 + * are met:
215 + *
216 + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
217 + * we've always done all needed flushes to catch up to
218 + * local_tlb_gen. If, for example, local_tlb_gen == 2 and
219 + * f->new_tlb_gen == 3, then we know that the flush needed to bring
220 + * us up to date for tlb_gen 3 is the partial flush we're
221 + * processing.
222 + *
223 + * As an example of why this check is needed, suppose that there
224 + * are two concurrent flushes. The first is a full flush that
225 + * changes context.tlb_gen from 1 to 2. The second is a partial
226 + * flush that changes context.tlb_gen from 2 to 3. If they get
227 + * processed on this CPU in reverse order, we'll see
228 + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
229 + * If we were to use __flush_tlb_single() and set local_tlb_gen to
230 + * 3, we'd break the invariant: we'd update local_tlb_gen above
231 + * 1 without the full flush that's needed for tlb_gen 2.
232 + *
233 + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
234 + * Partial TLB flushes are not all that much cheaper than full TLB
235 + * flushes, so it seems unlikely that it would be a performance win
236 + * to do a partial flush if that won't bring our TLB fully up to
237 + * date. By doing a full flush instead, we can increase
238 + * local_tlb_gen all the way to mm_tlb_gen and we can probably
239 + * avoid another flush in the very near future.
240 + */
241 + if (f->end != TLB_FLUSH_ALL &&
242 + f->new_tlb_gen == local_tlb_gen + 1 &&
243 + f->new_tlb_gen == mm_tlb_gen) {
244 + /* Partial flush */
245 unsigned long addr;
246 unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
247 +
248 addr = f->start;
249 while (addr < f->end) {
250 __flush_tlb_single(addr);
251 @@ -182,7 +261,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
252 if (local)
253 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
254 trace_tlb_flush(reason, nr_pages);
255 + } else {
256 + /* Full flush. */
257 + local_flush_tlb();
258 + if (local)
259 + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
260 + trace_tlb_flush(reason, TLB_FLUSH_ALL);
261 }
262 +
263 + /* Both paths above update our state to mm_tlb_gen. */
264 + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen);
265 }
266
267 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
268 @@ -253,7 +341,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
269 cpu = get_cpu();
270
271 /* This is also a barrier that synchronizes with switch_mm(). */
272 - inc_mm_tlb_gen(mm);
273 + info.new_tlb_gen = inc_mm_tlb_gen(mm);
274
275 /* Should we flush just the requested range? */
276 if ((end != TLB_FLUSH_ALL) &&
277 --
278 2.14.2
279
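The long comment in flush_tlb_func_common() boils down to a small
predicate. A simplified standalone restatement in plain C (with an
invented struct and helper name, not the kernel's own) might look like:

    #include <stdbool.h>
    #include <stdint.h>

    #define TLB_FLUSH_ALL ((unsigned long)-1)

    struct flush_req {
            unsigned long start, end; /* range to flush; end == TLB_FLUSH_ALL means everything */
            uint64_t new_tlb_gen;     /* generation the sender wants this CPU to reach */
    };

    /*
     * Flush page-by-page only when the request is ranged, it is the very
     * next generation after what this CPU has already handled, and it
     * brings the CPU fully up to date; otherwise a full flush is simpler
     * and lets local_tlb_gen jump straight to mm_tlb_gen.
     */
    static bool do_partial_flush(const struct flush_req *f,
                                 uint64_t local_tlb_gen, uint64_t mm_tlb_gen)
    {
            return f->end != TLB_FLUSH_ALL &&
                   f->new_tlb_gen == local_tlb_gen + 1 &&
                   f->new_tlb_gen == mm_tlb_gen;
    }

Whichever branch is taken, the CPU then records mm_tlb_gen as its new
local generation, which is what the final this_cpu_write() in the hunk
above does.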