patches/kernel/0041-x86-mm-Rework-lazy-TLB-mode-and-TLB-freshness-tracki.patch

   1 From caa3549fe709971498eaf080c1710ef627a0df5a Mon Sep 17 00:00:00 2001
   2 From: Andy Lutomirski <luto@kernel.org>
   3 Date: Thu, 29 Jun 2017 08:53:17 -0700
   4 Subject: [PATCH 041/241] x86/mm: Rework lazy TLB mode and TLB freshness
   5  tracking
   6 MIME-Version: 1.0
   7 Content-Type: text/plain; charset=UTF-8
   8 Content-Transfer-Encoding: 8bit
   9
  10 CVE-2017-5754
  11
  12 x86's lazy TLB mode used to be fairly weak -- it would switch to
  13 init_mm the first time it tried to flush a lazy TLB.  This meant an
  14 unnecessary CR3 write and, if the flush was remote, an unnecessary
  15 IPI.
  16
  17 Rewrite it entirely.  When we enter lazy mode, we simply remove the
  18 CPU from mm_cpumask.  This means that we need a way to figure out
  19 whether we've missed a flush when we switch back out of lazy mode.
  20 I use the tlb_gen machinery to track whether a context is up to
  21 date.
  22
  23 Note to reviewers: this patch, by itself, looks a bit odd.  I'm
  24 using an array of length 1 containing (ctx_id, tlb_gen) rather than
  25 just storing tlb_gen, and making it an array isn't necessary yet.
  26 I'm doing this because the next few patches add PCID support, and,
  27 with PCID, we need ctx_id, and the array will end up with a length
  28 greater than 1.  Making it an array now means that there will be
  29 less churn and therefore less stress on your eyeballs.
  30
  31 NB: This is dubious but, AFAICT, still correct on Xen and UV.
  32 xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
  33 patch changes the way that mm_cpumask() works.  This should be okay,
  34 since Xen *also* iterates all online CPUs to find all the CPUs it
  35 needs to twiddle.
  36
  37 The UV tlbflush code is rather dated and should be changed.
  38
  39 Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
  40 (turbo off, intel_pstate requesting max performance) under KVM with
  41 the guest using idle=poll (to avoid artifacts when bouncing between
  42 CPUs).  I haven't done any real statistics here -- I just ran them
  43 in a loop and picked the fastest results that didn't look like
  44 outliers.  Unpatched means commit a4eb8b993554, so all the
  45 bookkeeping overhead is gone.
  46
  47 MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity.  In
  48 an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
  49 This is intended to be a nearly worst-case test.
  50
  51   patched:         13.4µs
  52   unpatched:       21.6µs
  53
  54 Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
  55 nrounds = 100, 256M data
  56
  57   patched:         1.1 seconds or so
  58   unpatched:       1.9 seconds or so
  59
  60 The speedup on Vitaly's test appears to be because it spends a lot
  61 of time blocked on mmap_sem, and this patch avoids sending IPIs to
  62 blocked CPUs.
  63
  64 Signed-off-by: Andy Lutomirski <luto@kernel.org>
  65 Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
  66 Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
  67 Cc: Andrew Banman <abanman@sgi.com>
  68 Cc: Andrew Morton <akpm@linux-foundation.org>
  69 Cc: Arjan van de Ven <arjan@linux.intel.com>
  70 Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
  71 Cc: Borislav Petkov <bp@alien8.de>
  72 Cc: Dave Hansen <dave.hansen@intel.com>
  73 Cc: Dimitri Sivanich <sivanich@sgi.com>
  74 Cc: Juergen Gross <jgross@suse.com>
  75 Cc: Linus Torvalds <torvalds@linux-foundation.org>
  76 Cc: Mel Gorman <mgorman@suse.de>
  77 Cc: Mike Travis <travis@sgi.com>
  78 Cc: Peter Zijlstra <peterz@infradead.org>
  79 Cc: Rik van Riel <riel@redhat.com>
  80 Cc: linux-mm@kvack.org
  81 Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
  82 Signed-off-by: Ingo Molnar <mingo@kernel.org>
  83 (cherry picked from commit 94b1b03b519b81c494900cb112aa00ed205cc2d9)
  84 Signed-off-by: Andy Whitcroft <apw@canonical.com>
  85 Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
  86 (cherry picked from commit b381b7ae452f2bc6384507a897247be7c93a71cc)
  87 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
  88 ---
  89  arch/x86/include/asm/mmu_context.h |   6 +-
  90  arch/x86/include/asm/tlbflush.h    |   4 -
  91  arch/x86/mm/init.c                 |   1 -
  92  arch/x86/mm/tlb.c                  | 197 ++++++++++++++++++++++---------------
  93  arch/x86/xen/mmu_pv.c              |   5 +-
  94  5 files changed, 124 insertions(+), 89 deletions(-)
  95
  96 diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
  97 index 6c05679c715b..d6b055b328f2 100644
  98 --- a/arch/x86/include/asm/mmu_context.h
  99 +++ b/arch/x86/include/asm/mmu_context.h
 100 @@ -128,8 +128,10 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
 101
 102  static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 103  {
 104 -       if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
 105 -               this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
 106 +       int cpu = smp_processor_id();
 107 +
 108 +       if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
 109 +               cpumask_clear_cpu(cpu, mm_cpumask(mm));
 110  }
 111
 112  static inline int init_new_context(struct task_struct *tsk,
 113 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
 114 index 3a167c214560..6397275008db 100644
 115 --- a/arch/x86/include/asm/tlbflush.h
 116 +++ b/arch/x86/include/asm/tlbflush.h
 117 @@ -95,7 +95,6 @@ struct tlb_state {
 118          * mode even if we've already switched back to swapper_pg_dir.
 119          */
 120         struct mm_struct *loaded_mm;
 121 -       int state;
 122
 123         /*
 124          * Access to this CR4 shadow and to H/W CR4 is protected by
 125 @@ -318,9 +317,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 126  void native_flush_tlb_others(const struct cpumask *cpumask,
 127                              const struct flush_tlb_info *info);
 128
 129 -#define TLBSTATE_OK    1
 130 -#define TLBSTATE_LAZY  2
 131 -
 132  static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
 133                                         struct mm_struct *mm)
 134  {
 135 diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
 136 index df2624b091a7..c86dc071bb10 100644
 137 --- a/arch/x86/mm/init.c
 138 +++ b/arch/x86/mm/init.c
 139 @@ -849,7 +849,6 @@ void __init zone_sizes_init(void)
 140
 141  DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 142         .loaded_mm = &init_mm,
 143 -       .state = 0,
 144         .cr4 = ~0UL,    /* fail hard if we screw up cr4 shadow initialization */
 145  };
 146  EXPORT_SYMBOL_GPL(cpu_tlbstate);
 147 diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
 148 index 4e5a5ddb9e4d..0982c997d36f 100644
 149 --- a/arch/x86/mm/tlb.c
 150 +++ b/arch/x86/mm/tlb.c
 151 @@ -45,8 +45,8 @@ void leave_mm(int cpu)
 152         if (loaded_mm == &init_mm)
 153                 return;
 154
 155 -       if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
 156 -               BUG();
 157 +       /* Warn if we're not lazy. */
 158 +       WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
 159
 160         switch_mm(NULL, &init_mm, NULL);
 161  }
 162 @@ -65,94 +65,117 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 163  void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 164                         struct task_struct *tsk)
 165  {
 166 -       unsigned cpu = smp_processor_id();
 167         struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 168 +       unsigned cpu = smp_processor_id();
 169 +       u64 next_tlb_gen;
 170
 171         /*
 172 -        * NB: The scheduler will call us with prev == next when
 173 -        * switching from lazy TLB mode to normal mode if active_mm
 174 -        * isn't changing.  When this happens, there is no guarantee
 175 -        * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
 176 +        * NB: The scheduler will call us with prev == next when switching
 177 +        * from lazy TLB mode to normal mode if active_mm isn't changing.
 178 +        * When this happens, we don't assume that CR3 (and hence
 179 +        * cpu_tlbstate.loaded_mm) matches next.
 180          *
 181          * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
 182          */
 183
 184 -       this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
 185 +       /* We don't want flush_tlb_func_* to run concurrently with us. */
 186 +       if (IS_ENABLED(CONFIG_PROVE_LOCKING))
 187 +               WARN_ON_ONCE(!irqs_disabled());
 188 +
 189 +       /*
 190 +        * Verify that CR3 is what we think it is.  This will catch
 191 +        * hypothetical buggy code that directly switches to swapper_pg_dir
 192 +        * without going through leave_mm() / switch_mm_irqs_off().
 193 +        */
 194 +       VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd));
 195
 196         if (real_prev == next) {
 197 -               /*
 198 -                * There's nothing to do: we always keep the per-mm control
 199 -                * regs in sync with cpu_tlbstate.loaded_mm.  Just
 200 -                * sanity-check mm_cpumask.
 201 -                */
 202 -               if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
 203 -                       cpumask_set_cpu(cpu, mm_cpumask(next));
 204 -               return;
 205 -       }
 206 +               VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
 207 +                         next->context.ctx_id);
 208 +
 209 +               if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
 210 +                       /*
 211 +                        * There's nothing to do: we weren't lazy, and we
 212 +                        * aren't changing our mm.  We don't need to flush
 213 +                        * anything, nor do we need to update CR3, CR4, or
 214 +                        * LDTR.
 215 +                        */
 216 +                       return;
 217 +               }
 218 +
 219 +               /* Resume remote flushes and then read tlb_gen. */
 220 +               cpumask_set_cpu(cpu, mm_cpumask(next));
 221 +               next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 222 +
 223 +               if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) {
 224 +                       /*
 225 +                        * Ideally, we'd have a flush_tlb() variant that
 226 +                        * takes the known CR3 value as input.  This would
 227 +                        * be faster on Xen PV and on hypothetical CPUs
 228 +                        * on which INVPCID is fast.
 229 +                        */
 230 +                       this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
 231 +                                      next_tlb_gen);
 232 +                       write_cr3(__pa(next->pgd));
 233 +
 234 +                       /*
 235 +                        * This gets called via leave_mm() in the idle path
 236 +                        * where RCU functions differently.  Tracing normally
 237 +                        * uses RCU, so we have to call the tracepoint
 238 +                        * specially here.
 239 +                        */
 240 +                       trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
 241 +                                               TLB_FLUSH_ALL);
 242 +               }
 243
 244 -       if (IS_ENABLED(CONFIG_VMAP_STACK)) {
 245                 /*
 246 -                * If our current stack is in vmalloc space and isn't
 247 -                * mapped in the new pgd, we'll double-fault.  Forcibly
 248 -                * map it.
 249 +                * We just exited lazy mode, which means that CR4 and/or LDTR
 250 +                * may be stale.  (Changes to the required CR4 and LDTR states
 251 +                * are not reflected in tlb_gen.)
 252                  */
 253 -               unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
 254 -
 255 -               pgd_t *pgd = next->pgd + stack_pgd_index;
 256 -
 257 -               if (unlikely(pgd_none(*pgd)))
 258 -                       set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
 259 -       }
 260 +       } else {
 261 +               VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) ==
 262 +                         next->context.ctx_id);
 263 +
 264 +               if (IS_ENABLED(CONFIG_VMAP_STACK)) {
 265 +                       /*
 266 +                        * If our current stack is in vmalloc space and isn't
 267 +                        * mapped in the new pgd, we'll double-fault.  Forcibly
 268 +                        * map it.
 269 +                        */
 270 +                       unsigned int index = pgd_index(current_stack_pointer());
 271 +                       pgd_t *pgd = next->pgd + index;
 272 +
 273 +                       if (unlikely(pgd_none(*pgd)))
 274 +                               set_pgd(pgd, init_mm.pgd[index]);
 275 +               }
 276
 277 -       this_cpu_write(cpu_tlbstate.loaded_mm, next);
 278 -       this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
 279 -       this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen));
 280 +               /* Stop remote flushes for the previous mm */
 281 +               if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
 282 +                       cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
 283
 284 -       WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
 285 -       cpumask_set_cpu(cpu, mm_cpumask(next));
 286 +               VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
 287
 288 -       /*
 289 -        * Re-load page tables.
 290 -        *
 291 -        * This logic has an ordering constraint:
 292 -        *
 293 -        *  CPU 0: Write to a PTE for 'next'
 294 -        *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
 295 -        *  CPU 1: set bit 1 in next's mm_cpumask
 296 -        *  CPU 1: load from the PTE that CPU 0 writes (implicit)
 297 -        *
 298 -        * We need to prevent an outcome in which CPU 1 observes
 299 -        * the new PTE value and CPU 0 observes bit 1 clear in
 300 -        * mm_cpumask.  (If that occurs, then the IPI will never
 301 -        * be sent, and CPU 0's TLB will contain a stale entry.)
 302 -        *
 303 -        * The bad outcome can occur if either CPU's load is
 304 -        * reordered before that CPU's store, so both CPUs must
 305 -        * execute full barriers to prevent this from happening.
 306 -        *
 307 -        * Thus, switch_mm needs a full barrier between the
 308 -        * store to mm_cpumask and any operation that could load
 309 -        * from next->pgd.  TLB fills are special and can happen
 310 -        * due to instruction fetches or for no reason at all,
 311 -        * and neither LOCK nor MFENCE orders them.
 312 -        * Fortunately, load_cr3() is serializing and gives the
 313 -        * ordering guarantee we need.
 314 -        */
 315 -       load_cr3(next->pgd);
 316 +               /*
 317 +                * Start remote flushes and then read tlb_gen.
 318 +                */
 319 +               cpumask_set_cpu(cpu, mm_cpumask(next));
 320 +               next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 321
 322 -       /*
 323 -        * This gets called via leave_mm() in the idle path where RCU
 324 -        * functions differently.  Tracing normally uses RCU, so we have to
 325 -        * call the tracepoint specially here.
 326 -        */
 327 -       trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 328 +               this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
 329 +               this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen);
 330 +               this_cpu_write(cpu_tlbstate.loaded_mm, next);
 331 +               write_cr3(__pa(next->pgd));
 332
 333 -       /* Stop flush ipis for the previous mm */
 334 -       WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
 335 -                    real_prev != &init_mm);
 336 -       cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
 337 +               /*
 338 +                * This gets called via leave_mm() in the idle path where RCU
 339 +                * functions differently.  Tracing normally uses RCU, so we
 340 +                * have to call the tracepoint specially here.
 341 +                */
 342 +               trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
 343 +                                       TLB_FLUSH_ALL);
 344 +       }
 345
 346 -       /* Load per-mm CR4 and LDTR state */
 347         load_mm_cr4(next);
 348         switch_ldt(real_prev, next);
 349  }
 350 @@ -186,13 +209,13 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 351         VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
 352                    loaded_mm->context.ctx_id);
 353
 354 -       if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
 355 +       if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
 356                 /*
 357 -                * leave_mm() is adequate to handle any type of flush, and
 358 -                * we would prefer not to receive further IPIs.  leave_mm()
 359 -                * clears this CPU's bit in mm_cpumask().
 360 +                * We're in lazy mode -- don't flush.  We can get here on
 361 +                * remote flushes due to races and on local flushes if a
 362 +                * kernel thread coincidentally flushes the mm it's lazily
 363 +                * still using.
 364                  */
 365 -               leave_mm(smp_processor_id());
 366                 return;
 367         }
 368
 369 @@ -203,6 +226,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 370                  * be handled can catch us all the way up, leaving no work for
 371                  * the second flush.
 372                  */
 373 +               trace_tlb_flush(reason, 0);
 374                 return;
 375         }
 376
 377 @@ -304,6 +328,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 378                                 (info->end - info->start) >> PAGE_SHIFT);
 379
 380         if (is_uv_system()) {
 381 +               /*
 382 +                * This whole special case is confused.  UV has a "Broadcast
 383 +                * Assist Unit", which seems to be a fancy way to send IPIs.
 384 +                * Back when x86 used an explicit TLB flush IPI, UV was
 385 +                * optimized to use its own mechanism.  These days, x86 uses
 386 +                * smp_call_function_many(), but UV still uses a manual IPI,
 387 +                * and that IPI's action is out of date -- it does a manual
 388 +                * flush instead of calling flush_tlb_func_remote().  This
 389 +                * means that the percpu tlb_gen variables won't be updated
 390 +                * and we'll do pointless flushes on future context switches.
 391 +                *
 392 +                * Rather than hooking native_flush_tlb_others() here, I think
 393 +                * that UV should be updated so that smp_call_function_many(),
 394 +                * etc, are optimal on UV.
 395 +                */
 396                 unsigned int cpu;
 397
 398                 cpu = smp_processor_id();
 399 @@ -363,6 +402,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 400
 401         if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
 402                 flush_tlb_others(mm_cpumask(mm), &info);
 403 +
 404         put_cpu();
 405  }
 406
 407 @@ -371,8 +411,6 @@ static void do_flush_tlb_all(void *info)
 408  {
 409         count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
 410         __flush_tlb_all();
 411 -       if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
 412 -               leave_mm(smp_processor_id());
 413  }
 414
 415  void flush_tlb_all(void)
 416 @@ -425,6 +463,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 417
 418         if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
 419                 flush_tlb_others(&batch->cpumask, &info);
 420 +
 421         cpumask_clear(&batch->cpumask);
 422
 423         put_cpu();
 424 diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
 425 index 5f61b7e2e6b2..ba76f3ce997f 100644
 426 --- a/arch/x86/xen/mmu_pv.c
 427 +++ b/arch/x86/xen/mmu_pv.c
 428 @@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
 429         /* Get the "official" set of cpus referring to our pagetable. */
 430         if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
 431                 for_each_online_cpu(cpu) {
 432 -                       if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
 433 -                           && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
 434 +                       if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
 435                                 continue;
 436                         smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
 437                 }
 438                 return;
 439         }
 440 -       cpumask_copy(mask, mm_cpumask(mm));
 441
 442         /*
 443          * It's possible that a vcpu may have a stale reference to our
 444 @@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
 445          * look at its actual current cr3 value, and force it to flush
 446          * if needed.
 447          */
 448 +       cpumask_clear(mask);
 449         for_each_online_cpu(cpu) {
 450                 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
 451                         cpumask_set_cpu(cpu, mask);
 452 --
 453 2.14.2
 454