From e3c7bff633fc1210c6b19dd3ebcafb9f6716d586 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 24 Jul 2017 21:41:38 -0700
Subject: [PATCH 042/241] x86/mm: Implement PCID based optimization: try to
 preserve old TLB entries using PCID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CVE-2017-5754

PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.

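As an illustration only (not part of the patch itself), the way a CR3
value carries the PCID can be sketched as a small standalone C
program. The masks mirror the CR3_ADDR_MASK, CR3_PCID_MASK and
CR3_NOFLUSH constants added to processor-flags.h below;
build_cr3_sketch() and the example values are made up for the sketch:

#include <stdint.h>
#include <stdio.h>

#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull  /* page-table base address */
#define CR3_PCID_MASK 0xFFFull               /* 12-bit PCID ("ASID")    */
#define CR3_NOFLUSH   (1ull << 63)           /* keep this PCID's TLB    */

static uint64_t build_cr3_sketch(uint64_t pgd_pa, uint16_t asid, int noflush)
{
	uint64_t cr3 = (pgd_pa & CR3_ADDR_MASK) | (asid & CR3_PCID_MASK);

	if (noflush)
		cr3 |= CR3_NOFLUSH;  /* bit 63: don't flush this PCID's entries */
	return cr3;
}

int main(void)
{
	/* Switch to a page table at 0x1234000 with ASID 3, preserving the TLB. */
	printf("cr3 = %#llx\n",
	       (unsigned long long)build_cr3_sketch(0x1234000, 3, 1));
	return 0;
}
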
This is an unorthodox approach to using PCID. A 12-bit PCID gives
only 4096 distinct values, far too few to uniquely identify a
process, and we can't even really uniquely identify a running
process because there are monster systems with over 4096 CPUs. To
make matters worse, past attempts to use all 12 PCID bits have
resulted in slowdowns instead of speedups.

This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.

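For illustration, the resulting per-cpu recycling policy can be
modelled as a small standalone C program. The real implementation is
choose_new_asid() in the arch/x86/mm/tlb.c hunk below; struct
tlb_ctx_model, model_choose_asid() and the ctx_id values used here
are invented for the example:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TLB_NR_DYN_ASIDS 6	/* same per-cpu slot count as the patch */

struct tlb_ctx_model {
	uint64_t ctx_id;	/* which mm's entries this ASID holds */
	uint64_t tlb_gen;	/* how up to date those entries are   */
};

static struct tlb_ctx_model ctxs[TLB_NR_DYN_ASIDS];	/* one CPU's slots */
static uint16_t next_asid = 1;

static void model_choose_asid(uint64_t ctx_id, uint64_t next_tlb_gen,
			      uint16_t *new_asid, bool *need_flush)
{
	uint16_t asid;

	/* Reuse a slot if this mm ran on this CPU recently... */
	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		if (ctxs[asid].ctx_id != ctx_id)
			continue;
		*new_asid = asid;
		/* ...and skip the flush unless flushes were missed. */
		*need_flush = ctxs[asid].tlb_gen < next_tlb_gen;
		return;
	}

	/* Otherwise hand out slots round-robin and flush the stolen one. */
	*new_asid = next_asid++;
	if (*new_asid >= TLB_NR_DYN_ASIDS) {
		*new_asid = 0;
		next_asid = 1;
	}
	*need_flush = true;
}

int main(void)
{
	uint16_t asid;
	bool flush;

	model_choose_asid(42, 1, &asid, &flush);	/* first switch to mm 42 */
	printf("asid=%u flush=%d\n", (unsigned)asid, flush);	/* fresh slot, flush */
	ctxs[asid].ctx_id = 42;
	ctxs[asid].tlb_gen = 1;

	model_choose_asid(42, 1, &asid, &flush);	/* switch back to mm 42 */
	printf("asid=%u flush=%d\n", (unsigned)asid, flush);	/* reuse, no flush */
	return 0;
}
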
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.

ping-pong between two mms on the same CPU using eventfd:

 patched:         1.22µs
 patched, nopcid: 1.33µs
 unpatched:       1.34µs

Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:

 patched:         1.8µs, 11M dTLB misses
 patched, nopcid: 6.2µs, 207M dTLB misses
 unpatched:       6.1µs, 190M dTLB misses

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(backported from commit 10af6235e0d327d42e1bad974385197817923dc1)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit d833a976288cdcf7fb1dabb48ebf614ebf6a311c)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
 arch/x86/include/asm/mmu_context.h     |  3 ++
 arch/x86/include/asm/processor-flags.h |  2 +
 arch/x86/include/asm/tlbflush.h        | 18 +++++++-
 arch/x86/mm/init.c                     |  1 +
 arch/x86/mm/tlb.c                      | 84 +++++++++++++++++++++++++---------
 5 files changed, 85 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index d6b055b328f2..7ae318c340d9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -298,6 +298,9 @@ static inline unsigned long __get_current_cr3_fast(void)
 {
 	unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
 
+	if (static_cpu_has(X86_FEATURE_PCID))
+		cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
 	/* For now, be very restrictive about when this can be called. */
 	VM_WARN_ON(in_nmi() || preemptible());
 
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 79aa2f98398d..791b60199aa4 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -35,6 +35,7 @@
 /* Mask off the address space ID bits. */
 #define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
 #define CR3_PCID_MASK 0xFFFull
+#define CR3_NOFLUSH (1UL << 63)
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
@@ -42,6 +43,7 @@
  */
 #define CR3_ADDR_MASK 0xFFFFFFFFull
 #define CR3_PCID_MASK 0ull
+#define CR3_NOFLUSH 0
 #endif
 
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6397275008db..d23e61dc0640 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,12 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in
+ * two cache lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -95,6 +101,8 @@ struct tlb_state {
 	 * mode even if we've already switched back to swapper_pg_dir.
 	 */
 	struct mm_struct *loaded_mm;
+	u16 loaded_mm_asid;
+	u16 next_asid;
 
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
@@ -104,7 +112,8 @@ struct tlb_state {
 
 	/*
 	 * This is a list of all contexts that might exist in the TLB.
-	 * Since we don't yet use PCID, there is only one context.
+	 * There is one per ASID that we use, and the ASID (what the
+	 * CPU calls PCID) is the index into ctxts.
 	 *
 	 * For each context, ctx_id indicates which mm the TLB's user
 	 * entries came from. As an invariant, the TLB will never
@@ -114,8 +123,13 @@ struct tlb_state {
 	 * To be clear, this means that it's legal for the TLB code to
 	 * flush the TLB without updating tlb_gen. This can happen
 	 * (for now, at least) due to paravirt remote flushes.
+	 *
+	 * NB: context 0 is a bit special, since it's also used by
+	 * various bits of init code. This is fine -- code that
+	 * isn't aware of PCID will end up harmlessly flushing
+	 * context 0.
 	 */
-	struct tlb_context ctxs[1];
+	struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c86dc071bb10..af5c1ed21d43 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -849,6 +849,7 @@ void __init zone_sizes_init(void)
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
+	.next_asid = 1,
 	.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
 };
 EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0982c997d36f..57943b4d8f2e 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,40 @@
 
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
+static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
+			    u16 *new_asid, bool *need_flush)
+{
+	u16 asid;
+
+	if (!static_cpu_has(X86_FEATURE_PCID)) {
+		*new_asid = 0;
+		*need_flush = true;
+		return;
+	}
+
+	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
+		    next->context.ctx_id)
+			continue;
+
+		*new_asid = asid;
+		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
+			       next_tlb_gen);
+		return;
+	}
+
+	/*
+	 * We don't currently own an ASID slot on this CPU.
+	 * Allocate a slot.
+	 */
+	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
+	if (*new_asid >= TLB_NR_DYN_ASIDS) {
+		*new_asid = 0;
+		this_cpu_write(cpu_tlbstate.next_asid, 1);
+	}
+	*need_flush = true;
+}
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -66,6 +100,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
 
@@ -85,12 +120,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	/*
 	 * Verify that CR3 is what we think it is. This will catch
 	 * hypothetical buggy code that directly switches to swapper_pg_dir
-	 * without going through leave_mm() / switch_mm_irqs_off().
+	 * without going through leave_mm() / switch_mm_irqs_off() or that
+	 * does something like write_cr3(read_cr3_pa()).
 	 */
-	VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd));
+	VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
 
 	if (real_prev == next) {
-		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
 			  next->context.ctx_id);
 
 		if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
@@ -107,16 +143,17 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
-		if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) {
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
+		    next_tlb_gen) {
 			/*
 			 * Ideally, we'd have a flush_tlb() variant that
 			 * takes the known CR3 value as input. This would
 			 * be faster on Xen PV and on hypothetical CPUs
 			 * on which INVPCID is fast.
 			 */
-			this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
+			this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
 				       next_tlb_gen);
-			write_cr3(__pa(next->pgd));
+			write_cr3(__pa(next->pgd) | prev_asid);
 
 			/*
 			 * This gets called via leave_mm() in the idle path
@@ -134,8 +171,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		 * are not reflected in tlb_gen.)
 		 */
 	} else {
-		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) ==
-			  next->context.ctx_id);
+		u16 new_asid;
+		bool need_flush;
 
 		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
 			/*
@@ -162,18 +199,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
-		this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
-		this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen);
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		write_cr3(__pa(next->pgd));
+		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
-		/*
-		 * This gets called via leave_mm() in the idle path where RCU
-		 * functions differently. Tracing normally uses RCU, so we
-		 * have to call the tracepoint specially here.
-		 */
-		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
+		if (need_flush) {
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+			write_cr3(__pa(next->pgd) | new_asid);
+			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
 					TLB_FLUSH_ALL);
+		} else {
+			/* The new ASID is already up to date. */
+			write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+		}
+
+		this_cpu_write(cpu_tlbstate.loaded_mm, next);
+		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 	}
 
 	load_mm_cr4(next);
@@ -200,13 +241,14 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	 * wants us to catch up to.
 	 */
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
-	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen);
+	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
 
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
 
-	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
 		   loaded_mm->context.ctx_id);
 
 	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
@@ -294,7 +336,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	}
 
 	/* Both paths above update our state to mm_tlb_gen. */
-	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen);
+	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
 }
 
 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
--
2.14.2
