arch/x86/mm/tlb.c
x86/mm: Pass flush_tlb_info to flush_tlb_others() etc
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
#include <linux/debugfs.h>

/*
 *	Smarter SMP flushing macros.
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

#ifdef CONFIG_SMP

/*
 * We cannot call mmdrop() because we are in interrupt context,
 * instead update mm->cpu_vm_mask.
 */
void leave_mm(int cpu)
{
	struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		BUG();
	if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
		load_cr3(swapper_pg_dir);
		/*
		 * This gets called in the idle path where RCU
		 * functions differently.  Tracing normally
		 * uses RCU, so we have to call the tracepoint
		 * specially here.
		 */
		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
	}
}
EXPORT_SYMBOL_GPL(leave_mm);

#endif /* CONFIG_SMP */

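/*
 * switch_mm() is the interrupt-safe entry point: it simply wraps
 * switch_mm_irqs_off(), which does the actual page-table and per-mm
 * state switch, with interrupts disabled.
 */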
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}

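/*
 * switch_mm_irqs_off() - switch this CPU to the 'next' mm.
 *
 * Called with interrupts disabled.  For a real mm switch it updates
 * cpu_tlbstate, mm_cpumask, CR3, CR4 and the LDT; for prev == next it
 * only has to re-arm TLB flush IPIs if this CPU had gone lazy.
 */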
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	unsigned cpu = smp_processor_id();

	if (likely(prev != next)) {
		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			/*
			 * If our current stack is in vmalloc space and isn't
			 * mapped in the new pgd, we'll double-fault.  Forcibly
			 * map it.
			 */
			unsigned int stack_pgd_index = pgd_index(current_stack_pointer());

			pgd_t *pgd = next->pgd + stack_pgd_index;

			if (unlikely(pgd_none(*pgd)))
				set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
		}

#ifdef CONFIG_SMP
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		this_cpu_write(cpu_tlbstate.active_mm, next);
#endif

		cpumask_set_cpu(cpu, mm_cpumask(next));

		/*
		 * Re-load page tables.
		 *
		 * This logic has an ordering constraint:
		 *
		 *  CPU 0: Write to a PTE for 'next'
		 *  CPU 0: load bit 1 in mm_cpumask.  If nonzero, send IPI.
		 *  CPU 1: set bit 1 in next's mm_cpumask
		 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
		 *
		 * We need to prevent an outcome in which CPU 1 observes
		 * the new PTE value and CPU 0 observes bit 1 clear in
		 * mm_cpumask.  (If that occurs, then the IPI will never
		 * be sent, and CPU 0's TLB will contain a stale entry.)
		 *
		 * The bad outcome can occur if either CPU's load is
		 * reordered before that CPU's store, so both CPUs must
		 * execute full barriers to prevent this from happening.
		 *
		 * Thus, switch_mm needs a full barrier between the
		 * store to mm_cpumask and any operation that could load
		 * from next->pgd.  TLB fills are special and can happen
		 * due to instruction fetches or for no reason at all,
		 * and neither LOCK nor MFENCE orders them.
		 * Fortunately, load_cr3() is serializing and gives the
		 * ordering guarantee we need.
		 */
		load_cr3(next->pgd);

		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);

		/* Stop flush ipis for the previous mm */
		cpumask_clear_cpu(cpu, mm_cpumask(prev));

		/* Load per-mm CR4 state */
		load_mm_cr4(next);

#ifdef CONFIG_MODIFY_LDT_SYSCALL
		/*
		 * Load the LDT, if the LDT is different.
		 *
		 * It's possible that prev->context.ldt doesn't match
		 * the LDT register.  This can happen if leave_mm(prev)
		 * was called and then modify_ldt changed
		 * prev->context.ldt but suppressed an IPI to this CPU.
		 * In this case, prev->context.ldt != NULL, because we
		 * never set context.ldt to NULL while the mm still
		 * exists.  That means that next->context.ldt !=
		 * prev->context.ldt, because mms never share an LDT.
		 */
		if (unlikely(prev->context.ldt != next->context.ldt))
			load_mm_ldt(next);
#endif
	}
#ifdef CONFIG_SMP
	else {
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);

		if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
			/*
			 * On established mms, the mm_cpumask is only changed
			 * from irq context, from ptep_clear_flush() while in
			 * lazy tlb mode, and here.  Irqs are blocked during
			 * schedule, protecting us from simultaneous changes.
			 */
			cpumask_set_cpu(cpu, mm_cpumask(next));

			/*
			 * We were in lazy tlb mode and leave_mm disabled
			 * tlb flush IPI delivery.  We must reload CR3
			 * to make sure we don't use freed page tables.
			 *
			 * As above, load_cr3() is serializing and orders TLB
			 * fills with respect to the mm_cpumask write.
			 */
			load_cr3(next->pgd);
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
			load_mm_cr4(next);
			load_mm_ldt(next);
		}
	}
#endif
}

#ifdef CONFIG_SMP

/*
 * The flush IPI assumes that a thread switch happens in this order:
 * [cpu0: the cpu that switches]
 * 1) switch_mm() either 1a) or 1b)
 * 1a) thread switch to a different mm
 * 1a1) set cpu_tlbstate to TLBSTATE_OK
 *	Now the tlb flush IPI handler flush_tlb_func() won't call leave_mm
 *	if cpu0 was in lazy tlb mode.
 * 1a2) update cpu active_mm
 *	Now cpu0 accepts tlb flushes for the new mm.
 * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
 *	Now the other cpus will send tlb flush ipis.
 * 1a4) change cr3.
 * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
 *	Stop ipi delivery for the old mm.  This is not synchronized with
 *	the other cpus, but flush_tlb_func() ignores flush ipis for the
 *	wrong mm, so in the worst case we perform a superfluous tlb flush.
 * 1b) thread switch without mm change
 *	cpu active_mm is correct, cpu0 already handles flush ipis.
 * 1b1) set cpu_tlbstate to TLBSTATE_OK
 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
 *	Atomically set the bit [other cpus will start sending flush ipis],
 *	and test the bit.
 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
 * 2) switch %%esp, ie current
 *
 * The interrupt must handle 2 special cases:
 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
 *   runs in kernel space, the cpu could load tlb entries for user space
 *   pages.
 *
 * The good news is that cpu_tlbstate is local to each cpu, so there are
 * no write/read ordering problems.
 */

/*
 * TLB flush function:
 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 * 2) Leave the mm if we are in the lazy tlb mode.
 */
static void flush_tlb_func(void *info)
{
	const struct flush_tlb_info *f = info;

	inc_irq_stat(irq_tlb_count);

	if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.active_mm))
		return;

	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);

	if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
		leave_mm(smp_processor_id());
		return;
	}

	if (f->end == TLB_FLUSH_ALL) {
		local_flush_tlb();
		trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
	} else {
		unsigned long addr;
		unsigned long nr_pages =
			(f->end - f->start) / PAGE_SIZE;
		addr = f->start;
		while (addr < f->end) {
			__flush_tlb_single(addr);
			addr += PAGE_SIZE;
		}
		trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
	}
}

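/*
 * native_flush_tlb_others() sends the flush request to every CPU in
 * @cpumask by IPI (or via the UV BAU hardware on SGI UV systems) and
 * waits for flush_tlb_func() to complete on each of them.
 */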
void native_flush_tlb_others(const struct cpumask *cpumask,
			     const struct flush_tlb_info *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	if (info->end == TLB_FLUSH_ALL)
		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
	else
		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
				(info->end - info->start) >> PAGE_SHIFT);

	if (is_uv_system()) {
		unsigned int cpu;

		cpu = smp_processor_id();
		cpumask = uv_flush_tlb_others(cpumask, info);
		if (cpumask)
			smp_call_function_many(cpumask, flush_tlb_func,
					       (void *)info, 1);
		return;
	}
	smp_call_function_many(cpumask, flush_tlb_func,
			       (void *)info, 1);
}

/*
 * See Documentation/x86/tlb.txt for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;

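/*
 * flush_tlb_mm_range() flushes a range of user addresses for @mm on the
 * local CPU (falling back to a full flush once the range exceeds
 * tlb_single_page_flush_ceiling) and then kicks the remote CPUs in
 * mm_cpumask(mm) via flush_tlb_others().
 *
 * Illustrative call only (not part of this file): the flush_tlb_page()
 * and flush_tlb_range() wrappers in <asm/tlbflush.h> reduce to calls of
 * roughly this shape:
 *
 *	flush_tlb_mm_range(vma->vm_mm, address, address + PAGE_SIZE,
 *			   vma->vm_flags);
 */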
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
			unsigned long end, unsigned long vmflag)
{
	unsigned long addr;
	struct flush_tlb_info info;
	/* do a global flush by default */
	unsigned long base_pages_to_flush = TLB_FLUSH_ALL;

	preempt_disable();

	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
		base_pages_to_flush = (end - start) >> PAGE_SHIFT;
	if (base_pages_to_flush > tlb_single_page_flush_ceiling)
		base_pages_to_flush = TLB_FLUSH_ALL;

	if (current->active_mm != mm) {
		/* Synchronize with switch_mm. */
		smp_mb();

		goto out;
	}

	if (!current->mm) {
		leave_mm(smp_processor_id());

		/* Synchronize with switch_mm. */
		smp_mb();

		goto out;
	}

	/*
	 * Both branches below are implicit full barriers (MOV to CR or
	 * INVLPG) that synchronize with switch_mm.
	 */
	if (base_pages_to_flush == TLB_FLUSH_ALL) {
		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
		local_flush_tlb();
	} else {
		/* flush the range one page at a time with 'invlpg' */
		for (addr = start; addr < end; addr += PAGE_SIZE) {
			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
			__flush_tlb_single(addr);
		}
	}
	trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
out:
	info.mm = mm;
	if (base_pages_to_flush == TLB_FLUSH_ALL) {
		info.start = 0UL;
		info.end = TLB_FLUSH_ALL;
	} else {
		info.start = start;
		info.end = end;
	}
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);
	preempt_enable();
}

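/*
 * flush_tlb_all() flushes the entire TLB on every online CPU.
 * do_flush_tlb_all() is the per-CPU helper run by on_each_cpu(); it also
 * drops a lazy CPU out of its lazily-held mm so that it stops receiving
 * flush IPIs for it.
 */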
static void do_flush_tlb_all(void *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	__flush_tlb_all();
	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
		leave_mm(smp_processor_id());
}

void flush_tlb_all(void)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	on_each_cpu(do_flush_tlb_all, NULL, 1);
}

static void do_kernel_range_flush(void *info)
{
	struct flush_tlb_info *f = info;
	unsigned long addr;

	/* flush the range one page at a time with 'invlpg' */
	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
		__flush_tlb_single(addr);
}

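/*
 * flush_tlb_kernel_range() flushes kernel mappings in [start, end) on all
 * CPUs.  Small ranges are flushed page by page; anything larger than the
 * single-page-flush ceiling falls back to a full TLB flush, mirroring the
 * user-space heuristic above.
 */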
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	/* Apply the same ceiling as for user-space flushes; a bit conservative. */
	if (end == TLB_FLUSH_ALL ||
	    (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
		on_each_cpu(do_flush_tlb_all, NULL, 1);
	} else {
		struct flush_tlb_info info;
		info.start = start;
		info.end = end;
		on_each_cpu(do_kernel_range_flush, &info, 1);
	}
}

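/*
 * arch_tlbbatch_flush() finishes a batched unmap: it performs one full
 * local flush if this CPU is in the batch's cpumask, sends a single
 * flush-everything request to the remaining CPUs in the batch, and then
 * clears the cpumask for reuse.
 */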
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
	struct flush_tlb_info info = {
		.mm = NULL,
		.start = 0UL,
		.end = TLB_FLUSH_ALL,
	};

	int cpu = get_cpu();

	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
		local_flush_tlb();
		trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
	}

	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
		flush_tlb_others(&batch->cpumask, &info);
	cpumask_clear(&batch->cpumask);

	put_cpu();
}

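/*
 * debugfs handlers for the tlb_single_page_flush_ceiling knob: the read
 * handler reports the current ceiling, the write handler accepts a new
 * non-negative value.
 */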
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
				  size_t count, loff_t *ppos)
{
	char buf[32];
	unsigned int len;

	len = sprintf(buf, "%lu\n", tlb_single_page_flush_ceiling);
	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
		 const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	ssize_t len;
	int ceiling;

	len = min(count, sizeof(buf) - 1);
	if (copy_from_user(buf, user_buf, len))
		return -EFAULT;

	buf[len] = '\0';
	if (kstrtoint(buf, 0, &ceiling))
		return -EINVAL;

	if (ceiling < 0)
		return -EINVAL;

	tlb_single_page_flush_ceiling = ceiling;
	return count;
}

static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};

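/*
 * Expose the knob under the x86 debugfs directory.  With debugfs mounted
 * at the usual location this typically appears as (path may vary):
 *
 *	/sys/kernel/debug/x86/tlb_single_page_flush_ceiling
 *
 * e.g. 'echo 64 > .../tlb_single_page_flush_ceiling' raises the ceiling.
 */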
static int __init create_tlb_single_page_flush_ceiling(void)
{
	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
			    arch_debugfs_dir, NULL, &fops_tlbflush);
	return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);

#endif /* CONFIG_SMP */