From c1f19d153ad69363ac1bc62bbd9be05ca48c526c Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:16 -0700
Subject: [PATCH 040/241] x86/mm: Track the TLB's tlb_gen and update the
 flushing algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CVE-2017-5754

There are two kernel features that would benefit from tracking
how up-to-date each CPU's TLB is in the case where IPIs aren't keeping
it up to date in real time:

 - Lazy mm switching currently works by switching to init_mm when
   it would otherwise flush. This is wasteful: there isn't fundamentally
   any need to update CR3 at all when going lazy or when returning from
   lazy mode, nor is there any need to receive flush IPIs at all. Instead,
   we should just stop trying to keep the TLB coherent when we go lazy and,
   when unlazying, check whether we missed any flushes (see the sketch
   after this list).

 - PCID will let us keep recent user contexts alive in the TLB. If we
   start doing this, we need a way to decide whether those contexts are
   up to date.

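(Illustration only, not part of the patch: a minimal user-space C model
of the "check on unlazy" step described in the first item above. The
names mm_tlb_gen, cpu_tlb_gen, unlazy() and flush_local_tlb() are
made-up stand-ins for the per-mm generation counter, this CPU's copy of
it, and a real TLB flush.)

  /* Toy model only -- not kernel code. */
  #include <stdatomic.h>
  #include <stdint.h>
  #include <stdio.h>

  static atomic_uint_fast64_t mm_tlb_gen = 1; /* bumped by every flush request */
  static uint64_t cpu_tlb_gen = 1;            /* what this CPU has caught up to */

  static void flush_local_tlb(void) { puts("flush"); }

  /* While lazy, flush IPIs are ignored; on unlazy, catch up if we fell behind. */
  static void unlazy(void)
  {
          uint64_t gen = atomic_load(&mm_tlb_gen);

          if (cpu_tlb_gen < gen) {  /* we missed at least one flush while lazy */
                  flush_local_tlb();
                  cpu_tlb_gen = gen;
          }                         /* else: still coherent, no CR3 write needed */
  }

  int main(void)
  {
          atomic_fetch_add(&mm_tlb_gen, 1); /* a flush arrived while we were lazy */
          unlazy();                         /* flushes once and catches up */
          unlazy();                         /* no-op: nothing was missed */
          return 0;
  }
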
On some paravirt systems, remote TLBs can be flushed without IPIs.
This won't update the target CPUs' tlb_gens, which may cause
unnecessary local flushes later on. We can address this if it becomes
a problem by carefully updating the target CPU's tlb_gen directly.

By itself, this patch is a very minor optimization that avoids
unnecessary flushes when multiple TLB flushes targeting the same CPU
race. The complexity in this patch would not be worth it on its own,
but it will enable improved lazy TLB tracking and PCID (a toy model of
the racing-flush case is sketched below).

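(Again illustration only, not part of the patch: a toy C model of why
generation tracking turns the second of two racing flushes into a
no-op. flush_tlb_func(), mm_tlb_gen and local_tlb_gen are made-up
stand-ins for the IPI handler, the per-mm generation and the per-CPU
generation.)

  /* Toy model only -- not kernel code. */
  #include <stdatomic.h>
  #include <stdint.h>
  #include <stdio.h>

  static atomic_uint_fast64_t mm_tlb_gen = 1; /* latest generation for the mm */
  static uint64_t local_tlb_gen = 1;          /* what this CPU has caught up to */

  /* Models the flush IPI handler: catch all the way up, or do nothing. */
  static void flush_tlb_func(void)
  {
          uint64_t gen = atomic_load(&mm_tlb_gen);

          if (local_tlb_gen == gen) {
                  puts("no-op: an earlier flush already caught us up");
                  return;
          }
          puts("flush");
          local_tlb_gen = gen; /* jump to the latest gen, not just one step */
  }

  int main(void)
  {
          /* Two flush requests race: each bumps the generation ... */
          atomic_fetch_add(&mm_tlb_gen, 1);
          atomic_fetch_add(&mm_tlb_gen, 1);

          /* ... but only the first handler invocation does real work. */
          flush_tlb_func(); /* prints "flush" */
          flush_tlb_func(); /* prints the no-op message */
          return 0;
  }
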
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/1210fb244bc9cbe7677f7f0b72db4d359675f24b.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit b0579ade7cd82391360e959cc844e50a160e8a96)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit d34881c25f3c70228ed792fd62881185a25c4422)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
 arch/x86/include/asm/tlbflush.h |  43 +++++++++++++++--
 arch/x86/mm/tlb.c               | 102 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 135 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index f1f2e73b7b77..3a167c214560 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,11 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+struct tlb_context {
+	u64 ctx_id;
+	u64 tlb_gen;
+};
+
 struct tlb_state {
 	/*
 	 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
@@ -97,6 +102,21 @@ struct tlb_state {
 	 * disabling interrupts when modifying either one.
 	 */
 	unsigned long cr4;
+
+	/*
+	 * This is a list of all contexts that might exist in the TLB.
+	 * Since we don't yet use PCID, there is only one context.
+	 *
+	 * For each context, ctx_id indicates which mm the TLB's user
+	 * entries came from. As an invariant, the TLB will never
+	 * contain entries that are out-of-date as of when that mm reached
+	 * the tlb_gen in the list.
+	 *
+	 * To be clear, this means that it's legal for the TLB code to
+	 * flush the TLB without updating tlb_gen. This can happen
+	 * (for now, at least) due to paravirt remote flushes.
+	 */
+	struct tlb_context ctxs[1];
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
@@ -256,9 +276,26 @@ static inline void __flush_tlb_one(unsigned long addr)
  * and page-granular flushes are available only on i486 and up.
  */
 struct flush_tlb_info {
-	struct mm_struct *mm;
-	unsigned long start;
-	unsigned long end;
+	/*
+	 * We support several kinds of flushes.
+	 *
+	 * - Fully flush a single mm. .mm will be set, .end will be
+	 *   TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
+	 *   which the IPI sender is trying to catch us up.
+	 *
+	 * - Partially flush a single mm. .mm will be set, .start and
+	 *   .end will indicate the range, and .new_tlb_gen will be set
+	 *   such that the changes between generation .new_tlb_gen-1 and
+	 *   .new_tlb_gen are entirely contained in the indicated range.
+	 *
+	 * - Fully flush all mms whose tlb_gens have been updated. .mm
+	 *   will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
+	 *   will be zero.
+	 */
+	struct mm_struct	*mm;
+	unsigned long		start;
+	unsigned long		end;
+	u64			new_tlb_gen;
 };
 
 #define local_flush_tlb() __flush_tlb()
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 14f4f8f66aa8..4e5a5ddb9e4d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -105,6 +105,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	}
 
 	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
+	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen));
 
 	WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
 	cpumask_set_cpu(cpu, mm_cpumask(next));
@@ -155,25 +157,102 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	switch_ldt(real_prev, next);
 }
 
+/*
+ * flush_tlb_func_common()'s memory ordering requirement is that any
+ * TLB fills that happen after we flush the TLB are ordered after we
+ * read active_mm's tlb_gen. We don't need any explicit barriers
+ * because all x86 flush operations are serializing and the
+ * atomic64_read operation won't be reordered by the compiler.
+ */
 static void flush_tlb_func_common(const struct flush_tlb_info *f,
 				  bool local, enum tlb_flush_reason reason)
 {
+	/*
+	 * We have three different tlb_gen values in here. They are:
+	 *
+	 * - mm_tlb_gen:     the latest generation.
+	 * - local_tlb_gen:  the generation that this CPU has already caught
+	 *                   up to.
+	 * - f->new_tlb_gen: the generation that the requester of the flush
+	 *                   wants us to catch up to.
+	 */
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen);
+
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
 
+	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+		   loaded_mm->context.ctx_id);
+
 	if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
+		/*
+		 * leave_mm() is adequate to handle any type of flush, and
+		 * we would prefer not to receive further IPIs. leave_mm()
+		 * clears this CPU's bit in mm_cpumask().
+		 */
 		leave_mm(smp_processor_id());
 		return;
 	}
 
-	if (f->end == TLB_FLUSH_ALL) {
-		local_flush_tlb();
-		if (local)
-			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-		trace_tlb_flush(reason, TLB_FLUSH_ALL);
-	} else {
+	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
+		/*
+		 * There's nothing to do: we're already up to date. This can
+		 * happen if two concurrent flushes happen -- the first flush to
+		 * be handled can catch us all the way up, leaving no work for
+		 * the second flush.
+		 */
+		return;
+	}
+
+	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
+	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
+
+	/*
+	 * If we get to this point, we know that our TLB is out of date.
+	 * This does not strictly imply that we need to flush (it's
+	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
+	 * going to need to flush in the very near future, so we might
+	 * as well get it over with.
+	 *
+	 * The only question is whether to do a full or partial flush.
+	 *
+	 * We do a partial flush if requested and two extra conditions
+	 * are met:
+	 *
+	 * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
+	 *    we've always done all needed flushes to catch up to
+	 *    local_tlb_gen. If, for example, local_tlb_gen == 2 and
+	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
+	 *    us up to date for tlb_gen 3 is the partial flush we're
+	 *    processing.
+	 *
+	 *    As an example of why this check is needed, suppose that there
+	 *    are two concurrent flushes. The first is a full flush that
+	 *    changes context.tlb_gen from 1 to 2. The second is a partial
+	 *    flush that changes context.tlb_gen from 2 to 3. If they get
+	 *    processed on this CPU in reverse order, we'll see
+	 *    local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
+	 *    If we were to use __flush_tlb_single() and set local_tlb_gen to
+	 *    3, we'd break the invariant: we'd update local_tlb_gen above
+	 *    1 without the full flush that's needed for tlb_gen 2.
+	 *
+	 * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
+	 *    Partial TLB flushes are not all that much cheaper than full TLB
+	 *    flushes, so it seems unlikely that it would be a performance win
+	 *    to do a partial flush if that won't bring our TLB fully up to
+	 *    date. By doing a full flush instead, we can increase
+	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
+	 *    avoid another flush in the very near future.
+	 */
+	if (f->end != TLB_FLUSH_ALL &&
+	    f->new_tlb_gen == local_tlb_gen + 1 &&
+	    f->new_tlb_gen == mm_tlb_gen) {
+		/* Partial flush */
 		unsigned long addr;
 		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+
 		addr = f->start;
 		while (addr < f->end) {
 			__flush_tlb_single(addr);
@@ -182,7 +261,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		if (local)
 			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
 		trace_tlb_flush(reason, nr_pages);
+	} else {
+		/* Full flush. */
+		local_flush_tlb();
+		if (local)
+			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+		trace_tlb_flush(reason, TLB_FLUSH_ALL);
 	}
+
+	/* Both paths above update our state to mm_tlb_gen. */
+	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen);
 }
 
 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
@@ -253,7 +341,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	cpu = get_cpu();
 
 	/* This is also a barrier that synchronizes with switch_mm(). */
-	inc_mm_tlb_gen(mm);
+	info.new_tlb_gen = inc_mm_tlb_gen(mm);
 
 	/* Should we flush just the requested range? */
 	if ((end != TLB_FLUSH_ALL) &&
-- 
2.14.2