From 71b3c126e61177eb693423f2e18a1914205b165e Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 6 Jan 2016 12:21:01 -0800
Subject: [PATCH] x86/mm: Add barriers and document switch_mm()-vs-flush
 synchronization

When switch_mm() activates a new PGD, it also sets a bit that
tells other CPUs that the PGD is in use so that TLB flush IPIs
will be sent.  In order for that to work correctly, the bit
needs to be visible prior to loading the PGD and therefore
starting to fill the local TLB.

Document all the barriers that make this work correctly and add
a couple that were missing.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: stable@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
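The ordering constraint described above is the classic "store
buffering" pattern.  The following is a userspace C11 sketch of it,
not kernel code: all names are stand-ins, and the seq_cst fences
stand in for smp_mb() and for the serializing CR3 write / INVLPG that
this patch documents.  The assertion encodes the outcome the barriers
forbid: the switching CPU caches the old PTE while the flushing CPU
concludes no IPI is needed.

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pte;         /* PTE the flush side rewrites */
static atomic_int cpumask_bit; /* "CPU 1 uses this mm" bit in mm_cpumask */
static atomic_int ipi_sent;    /* flush side decided an IPI is needed */
static atomic_int tlb_entry;   /* PTE value the switch side's TLB fill saw */

/* Flush side: write the PTE, full barrier, then consult mm_cpumask. */
static void *flusher(void *arg)
{
	atomic_store_explicit(&pte, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);  /* smp_mb() / CR3 / INVLPG */
	if (atomic_load_explicit(&cpumask_bit, memory_order_relaxed))
		atomic_store_explicit(&ipi_sent, 1, memory_order_relaxed);
	return NULL;
}

/* Switch side: set the mm_cpumask bit, full barrier, then fill the TLB. */
static void *switcher(void *arg)
{
	atomic_store_explicit(&cpumask_bit, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);  /* load_cr3() serializes */
	atomic_store_explicit(&tlb_entry,
			      atomic_load_explicit(&pte, memory_order_relaxed),
			      memory_order_relaxed);
	return NULL;
}

int main(void)
{
	pthread_t f, s;

	pthread_create(&f, NULL, flusher, NULL);
	pthread_create(&s, NULL, switcher, NULL);
	pthread_join(f, NULL);
	pthread_join(s, NULL);

	/*
	 * Forbidden outcome: the switch side cached the old PTE
	 * (tlb_entry == 0) and the flush side saw the bit clear, so
	 * no IPI will ever fix it up (ipi_sent == 0).  With both
	 * fences, at least one side must observe the other's store.
	 */
	assert(atomic_load(&tlb_entry) == 1 || atomic_load(&ipi_sent) == 1);
	printf("tlb_entry=%d ipi_sent=%d\n",
	       atomic_load(&tlb_entry), atomic_load(&ipi_sent));
	return 0;
}

Built with e.g. "cc -pthread", the assertion cannot fire; remove
either fence and the C11 memory model (like x86's store buffers)
permits both loads to miss the other thread's store, which is the
stale-TLB scenario the changelog describes.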
 arch/x86/include/asm/mmu_context.h | 33 ++++++++++++++++++++++++++++++++-
 arch/x86/mm/tlb.c                  | 29 ++++++++++++++++++++++++++---
 2 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 379cd36..1edc9cd 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -116,8 +116,34 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 #endif
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		/* Re-load page tables */
+		/*
+		 * Re-load page tables.
+		 *
+		 * This logic has an ordering constraint:
+		 *
+		 *  CPU 0: Write to a PTE for 'next'
+		 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
+		 *  CPU 1: set bit 1 in next's mm_cpumask
+		 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
+		 *
+		 * We need to prevent an outcome in which CPU 1 observes
+		 * the new PTE value and CPU 0 observes bit 1 clear in
+		 * mm_cpumask.  (If that occurs, then the IPI will never
+		 * be sent, and CPU 0's TLB will contain a stale entry.)
+		 *
+		 * The bad outcome can occur if either CPU's load is
+		 * reordered before that CPU's store, so both CPUs must
+		 * execute full barriers to prevent this from happening.
+		 *
+		 * Thus, switch_mm needs a full barrier between the
+		 * store to mm_cpumask and any operation that could load
+		 * from next->pgd.  This barrier synchronizes with
+		 * remote TLB flushers.  Fortunately, load_cr3 is
+		 * serializing and thus acts as a full barrier.
+		 *
+		 */
 		load_cr3(next->pgd);
+
 		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 
 		/* Stop flush ipis for the previous mm */
@@ -156,10 +182,15 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			 * schedule, protecting us from simultaneous changes.
 			 */
 			cpumask_set_cpu(cpu, mm_cpumask(next));
+
 			/*
 			 * We were in lazy tlb mode and leave_mm disabled
 			 * tlb flush IPI delivery. We must reload CR3
 			 * to make sure to use no freed page tables.
+			 *
+			 * As above, this is a barrier that forces
+			 * TLB repopulation to be ordered after the
+			 * store to mm_cpumask.
 			 */
 			load_cr3(next->pgd);
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0..8f4cc3d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
 	preempt_disable();
 
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+	/* This is an implicit full barrier that synchronizes with switch_mm. */
 	local_flush_tlb();
+
 	trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
 		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
 
 	preempt_disable();
-	if (current->active_mm != mm)
+	if (current->active_mm != mm) {
+		/* Synchronize with switch_mm. */
+		smp_mb();
+
 		goto out;
+	}
 
 	if (!current->mm) {
 		leave_mm(smp_processor_id());
+
+		/* Synchronize with switch_mm. */
+		smp_mb();
+
 		goto out;
 	}
 
 	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
 		base_pages_to_flush = (end - start) >> PAGE_SHIFT;
 
+	/*
+	 * Both branches below are implicit full barriers (MOV to CR or
+	 * INVLPG) that synchronize with switch_mm.
+	 */
 	if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
 		base_pages_to_flush = TLB_FLUSH_ALL;
 		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 	preempt_disable();
 
 	if (current->active_mm == mm) {
-		if (current->mm)
+		if (current->mm) {
+			/*
+			 * Implicit full barrier (INVLPG) that synchronizes
+			 * with switch_mm.
+			 */
 			__flush_tlb_one(start);
-		else
+		} else {
 			leave_mm(smp_processor_id());
+
+			/* Synchronize with switch_mm. */
+			smp_mb();
+		}
 	}
 
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-- 
2.1.4
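
A worked trace of the failure mode closed off by the smp_mb() added to
the current->active_mm != mm early-out above (a reading aid
reconstructed from the patch; the out: label, beyond the hunk's
context lines, is where flush_tlb_mm_range() reads mm_cpumask(mm) to
pick IPI targets):

  CPU 0 (flush_tlb_mm_range on mm)    CPU 1 (switch_mm to mm)
  1. write a new PTE for mm
  2. read mm_cpumask(mm) at out: and  3. cpumask_set_cpu(cpu, mm_cpumask(mm))
     IPI the CPUs whose bits are set  4. load_cr3(mm->pgd); the TLB may now
                                         cache translations from mm's tables

On x86, each CPU may satisfy a load before its own earlier store has
drained from the store buffer.  Without barriers, step 2 can miss the
bit set in step 3 while step 4's TLB fill still sees the pre-step-1
PTE: CPU 0 skips the IPI, CPU 1 keeps a stale translation, and nothing
ever flushes it.  A barrier on one side alone does not exclude that
combined outcome, which is why the changelog says both CPUs must
execute full barriers: the smp_mb() between steps 1 and 2 here pairs
with the serializing load_cr3() between steps 3 and 4 in switch_mm().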