1 #include <linux/init.h>
4 #include <linux/spinlock.h>
6 #include <linux/interrupt.h>
7 #include <linux/export.h>
10 #include <asm/tlbflush.h>
11 #include <asm/mmu_context.h>
12 #include <asm/cache.h>
14 #include <asm/uv/uv.h>
15 #include <linux/debugfs.h>
18 * TLB flushing, formerly SMP-only
21 * These mean you can really definitely utterly forget about
 * writing to user space from interrupts. (It's not allowed anyway).
24 * Optimizations Manfred Spraul <manfred@colorfullife.com>
26 * More scalable flush, from Andi Kleen
28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
31 void leave_mm(int cpu
)
33 struct mm_struct
*loaded_mm
= this_cpu_read(cpu_tlbstate
.loaded_mm
);
36 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
37 * If so, our callers still expect us to flush the TLB, but there
38 * aren't any user TLB entries in init_mm to worry about.
40 * This needs to happen before any other sanity checks due to
41 * intel_idle's shenanigans.
43 if (loaded_mm
== &init_mm
)
46 if (this_cpu_read(cpu_tlbstate
.state
) == TLBSTATE_OK
)
49 switch_mm(NULL
, &init_mm
, NULL
);
51 EXPORT_SYMBOL_GPL(leave_mm
);
/*
 * Switch the CPU's address space from @prev to @next, on behalf of @tsk.
 *
 * Thin IRQ-safe wrapper: the real work happens in switch_mm_irqs_off(),
 * which requires interrupts to be disabled for the duration of the switch.
 */
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}
63 void switch_mm_irqs_off(struct mm_struct
*prev
, struct mm_struct
*next
,
64 struct task_struct
*tsk
)
66 unsigned cpu
= smp_processor_id();
67 struct mm_struct
*real_prev
= this_cpu_read(cpu_tlbstate
.loaded_mm
);
70 * NB: The scheduler will call us with prev == next when
71 * switching from lazy TLB mode to normal mode if active_mm
72 * isn't changing. When this happens, there is no guarantee
73 * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
75 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
78 this_cpu_write(cpu_tlbstate
.state
, TLBSTATE_OK
);
80 if (real_prev
== next
) {
82 * There's nothing to do: we always keep the per-mm control
83 * regs in sync with cpu_tlbstate.loaded_mm. Just
84 * sanity-check mm_cpumask.
86 if (WARN_ON_ONCE(!cpumask_test_cpu(cpu
, mm_cpumask(next
))))
87 cpumask_set_cpu(cpu
, mm_cpumask(next
));
91 if (IS_ENABLED(CONFIG_VMAP_STACK
)) {
93 * If our current stack is in vmalloc space and isn't
94 * mapped in the new pgd, we'll double-fault. Forcibly
97 unsigned int stack_pgd_index
= pgd_index(current_stack_pointer());
99 pgd_t
*pgd
= next
->pgd
+ stack_pgd_index
;
101 if (unlikely(pgd_none(*pgd
)))
102 set_pgd(pgd
, init_mm
.pgd
[stack_pgd_index
]);
105 this_cpu_write(cpu_tlbstate
.loaded_mm
, next
);
107 WARN_ON_ONCE(cpumask_test_cpu(cpu
, mm_cpumask(next
)));
108 cpumask_set_cpu(cpu
, mm_cpumask(next
));
111 * Re-load page tables.
113 * This logic has an ordering constraint:
115 * CPU 0: Write to a PTE for 'next'
116 * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
117 * CPU 1: set bit 1 in next's mm_cpumask
118 * CPU 1: load from the PTE that CPU 0 writes (implicit)
120 * We need to prevent an outcome in which CPU 1 observes
121 * the new PTE value and CPU 0 observes bit 1 clear in
122 * mm_cpumask. (If that occurs, then the IPI will never
123 * be sent, and CPU 0's TLB will contain a stale entry.)
125 * The bad outcome can occur if either CPU's load is
126 * reordered before that CPU's store, so both CPUs must
127 * execute full barriers to prevent this from happening.
129 * Thus, switch_mm needs a full barrier between the
130 * store to mm_cpumask and any operation that could load
131 * from next->pgd. TLB fills are special and can happen
132 * due to instruction fetches or for no reason at all,
133 * and neither LOCK nor MFENCE orders them.
134 * Fortunately, load_cr3() is serializing and gives the
135 * ordering guarantee we need.
140 * This gets called via leave_mm() in the idle path where RCU
141 * functions differently. Tracing normally uses RCU, so we have to
142 * call the tracepoint specially here.
144 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH
, TLB_FLUSH_ALL
);
146 /* Stop flush ipis for the previous mm */
147 WARN_ON_ONCE(!cpumask_test_cpu(cpu
, mm_cpumask(real_prev
)) &&
148 real_prev
!= &init_mm
);
149 cpumask_clear_cpu(cpu
, mm_cpumask(real_prev
));
151 /* Load per-mm CR4 and LDTR state */
153 switch_ldt(real_prev
, next
);
157 * The flush IPI assumes that a thread switch happens in this order:
158 * [cpu0: the cpu that switches]
159 * 1) switch_mm() either 1a) or 1b)
160 * 1a) thread switch to a different mm
161 * 1a1) set cpu_tlbstate to TLBSTATE_OK
162 * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
163 * if cpu0 was in lazy tlb mode.
164 * 1a2) update cpu active_mm
165 * Now cpu0 accepts tlb flushes for the new mm.
166 * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
167 * Now the other cpus will send tlb flush ipis.
169 * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
170 * Stop ipi delivery for the old mm. This is not synchronized with
 * the other cpus, but flush_tlb_func ignores flush ipis for the wrong
172 * mm, and in the worst case we perform a superfluous tlb flush.
173 * 1b) thread switch without mm change
174 * cpu active_mm is correct, cpu0 already handles flush ipis.
175 * 1b1) set cpu_tlbstate to TLBSTATE_OK
176 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
177 * Atomically set the bit [other cpus will start sending flush ipis],
179 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
180 * 2) switch %%esp, ie current
182 * The interrupt must handle 2 special cases:
183 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
184 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
185 * runs in kernel space, the cpu could load tlb entries for user space
188 * The good news is that cpu_tlbstate is local to each cpu, no
189 * write/read ordering problems.
192 static void flush_tlb_func_common(const struct flush_tlb_info
*f
,
193 bool local
, enum tlb_flush_reason reason
)
195 if (this_cpu_read(cpu_tlbstate
.state
) != TLBSTATE_OK
) {
196 leave_mm(smp_processor_id());
200 if (f
->end
== TLB_FLUSH_ALL
) {
203 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL
);
204 trace_tlb_flush(reason
, TLB_FLUSH_ALL
);
207 unsigned long nr_pages
= (f
->end
- f
->start
) >> PAGE_SHIFT
;
209 while (addr
< f
->end
) {
210 __flush_tlb_single(addr
);
214 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE
, nr_pages
);
215 trace_tlb_flush(reason
, nr_pages
);
219 static void flush_tlb_func_local(void *info
, enum tlb_flush_reason reason
)
221 const struct flush_tlb_info
*f
= info
;
223 flush_tlb_func_common(f
, true, reason
);
226 static void flush_tlb_func_remote(void *info
)
228 const struct flush_tlb_info
*f
= info
;
230 inc_irq_stat(irq_tlb_count
);
232 if (f
->mm
&& f
->mm
!= this_cpu_read(cpu_tlbstate
.loaded_mm
))
235 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED
);
236 flush_tlb_func_common(f
, false, TLB_REMOTE_SHOOTDOWN
);
239 void native_flush_tlb_others(const struct cpumask
*cpumask
,
240 const struct flush_tlb_info
*info
)
242 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH
);
243 if (info
->end
== TLB_FLUSH_ALL
)
244 trace_tlb_flush(TLB_REMOTE_SEND_IPI
, TLB_FLUSH_ALL
);
246 trace_tlb_flush(TLB_REMOTE_SEND_IPI
,
247 (info
->end
- info
->start
) >> PAGE_SHIFT
);
249 if (is_uv_system()) {
252 cpu
= smp_processor_id();
253 cpumask
= uv_flush_tlb_others(cpumask
, info
);
255 smp_call_function_many(cpumask
, flush_tlb_func_remote
,
259 smp_call_function_many(cpumask
, flush_tlb_func_remote
,
264 * See Documentation/x86/tlb.txt for details. We choose 33
265 * because it is large enough to cover the vast majority (at
266 * least 95%) of allocations, and is small enough that we are
267 * confident it will not cause too much overhead. Each single
268 * flush is about 100 ns, so this caps the maximum overhead at
271 * This is in units of pages.
273 static unsigned long tlb_single_page_flush_ceiling __read_mostly
= 33;
275 void flush_tlb_mm_range(struct mm_struct
*mm
, unsigned long start
,
276 unsigned long end
, unsigned long vmflag
)
280 struct flush_tlb_info info
= {
286 /* Synchronize with switch_mm. */
289 /* Should we flush just the requested range? */
290 if ((end
!= TLB_FLUSH_ALL
) &&
291 !(vmflag
& VM_HUGETLB
) &&
292 ((end
- start
) >> PAGE_SHIFT
) <= tlb_single_page_flush_ceiling
) {
297 info
.end
= TLB_FLUSH_ALL
;
300 if (mm
== this_cpu_read(cpu_tlbstate
.loaded_mm
))
301 flush_tlb_func_local(&info
, TLB_LOCAL_MM_SHOOTDOWN
);
302 if (cpumask_any_but(mm_cpumask(mm
), cpu
) < nr_cpu_ids
)
303 flush_tlb_others(mm_cpumask(mm
), &info
);
308 static void do_flush_tlb_all(void *info
)
310 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED
);
312 if (this_cpu_read(cpu_tlbstate
.state
) == TLBSTATE_LAZY
)
313 leave_mm(smp_processor_id());
316 void flush_tlb_all(void)
318 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH
);
319 on_each_cpu(do_flush_tlb_all
, NULL
, 1);
322 static void do_kernel_range_flush(void *info
)
324 struct flush_tlb_info
*f
= info
;
327 /* flush range by one by one 'invlpg' */
328 for (addr
= f
->start
; addr
< f
->end
; addr
+= PAGE_SIZE
)
329 __flush_tlb_single(addr
);
332 void flush_tlb_kernel_range(unsigned long start
, unsigned long end
)
335 /* Balance as user space task's flush, a bit conservative */
336 if (end
== TLB_FLUSH_ALL
||
337 (end
- start
) > tlb_single_page_flush_ceiling
<< PAGE_SHIFT
) {
338 on_each_cpu(do_flush_tlb_all
, NULL
, 1);
340 struct flush_tlb_info info
;
343 on_each_cpu(do_kernel_range_flush
, &info
, 1);
347 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch
*batch
)
349 struct flush_tlb_info info
= {
352 .end
= TLB_FLUSH_ALL
,
357 if (cpumask_test_cpu(cpu
, &batch
->cpumask
))
358 flush_tlb_func_local(&info
, TLB_LOCAL_SHOOTDOWN
);
359 if (cpumask_any_but(&batch
->cpumask
, cpu
) < nr_cpu_ids
)
360 flush_tlb_others(&batch
->cpumask
, &info
);
361 cpumask_clear(&batch
->cpumask
);
366 static ssize_t
tlbflush_read_file(struct file
*file
, char __user
*user_buf
,
367 size_t count
, loff_t
*ppos
)
372 len
= sprintf(buf
, "%ld\n", tlb_single_page_flush_ceiling
);
373 return simple_read_from_buffer(user_buf
, count
, ppos
, buf
, len
);
376 static ssize_t
tlbflush_write_file(struct file
*file
,
377 const char __user
*user_buf
, size_t count
, loff_t
*ppos
)
383 len
= min(count
, sizeof(buf
) - 1);
384 if (copy_from_user(buf
, user_buf
, len
))
388 if (kstrtoint(buf
, 0, &ceiling
))
394 tlb_single_page_flush_ceiling
= ceiling
;
398 static const struct file_operations fops_tlbflush
= {
399 .read
= tlbflush_read_file
,
400 .write
= tlbflush_write_file
,
401 .llseek
= default_llseek
,
404 static int __init
create_tlb_single_page_flush_ceiling(void)
406 debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR
| S_IWUSR
,
407 arch_debugfs_dir
, NULL
, &fops_tlbflush
);
410 late_initcall(create_tlb_single_page_flush_ceiling
);