1 From c1f19d153ad69363ac1bc62bbd9be05ca48c526c Mon Sep 17 00:00:00 2001
2 From: Andy Lutomirski <luto@kernel.org>
3 Date: Thu, 29 Jun 2017 08:53:16 -0700
4 Subject: [PATCH 040/241] x86/mm: Track the TLB's tlb_gen and update the
5 flushing algorithm
6 MIME-Version: 1.0
7 Content-Type: text/plain; charset=UTF-8
8 Content-Transfer-Encoding: 8bit
9
10 CVE-2017-5754
11
12 There are two kernel features that would benefit from tracking
13 how up-to-date each CPU's TLB is in the case where IPIs aren't keeping
14 it up to date in real time:
15
16 - Lazy mm switching currently works by switching to init_mm when
17 it would otherwise flush. This is wasteful: there isn't fundamentally
18 any need to update CR3 at all when going lazy or when returning from
19 lazy mode, nor is there any need to receive flush IPIs at all. Instead,
20 we should just stop trying to keep the TLB coherent when we go lazy and,
21 when unlazying, check whether we missed any flushes.
22
23 - PCID will let us keep recent user contexts alive in the TLB. If we
24 start doing this, we need a way to decide whether those contexts are
25 up to date.
26
27 On some paravirt systems, remote TLBs can be flushed without IPIs.
28 This won't update the target CPUs' tlb_gens, which may cause
29 unnecessary local flushes later on. We can address this if it becomes
30 a problem by carefully updating the target CPU's tlb_gen directly.
31
32 By itself, this patch is a very minor optimization that avoids
33 unnecessary flushes when multiple TLB flushes targeting the same CPU
34 race. The complexity in this patch would not be worth it on its own,
35 but it will enable improved lazy TLB tracking and PCID.
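
The bookkeeping behind this can be illustrated with a small, standalone
C sketch (simplified types and invented names, not the kernel's actual
per-CPU machinery): each mm carries a monotonically increasing flush
generation, each CPU remembers the generation it has already caught up
to, and a flush request that finds the CPU already up to date becomes a
no-op.

    #include <stdatomic.h>
    #include <stdbool.h>

    /* One counter per address space: bumped whenever its mappings change. */
    struct mm_gen {
            atomic_ullong tlb_gen;
    };

    /* What one CPU has already flushed for that address space. */
    struct cpu_tlb {
            unsigned long long local_tlb_gen;
    };

    /* A flusher bumps the generation and asks other CPUs to catch up to it. */
    static unsigned long long bump_gen(struct mm_gen *mm)
    {
            return atomic_fetch_add(&mm->tlb_gen, 1) + 1;
    }

    /*
     * On the receiving CPU: if a racing flush already brought us up to
     * date, there is nothing left to do.
     */
    static bool needs_flush(struct cpu_tlb *cpu, struct mm_gen *mm)
    {
            return cpu->local_tlb_gen < atomic_load(&mm->tlb_gen);
    }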
36
37 Signed-off-by: Andy Lutomirski <luto@kernel.org>
38 Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
39 Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
40 Cc: Andrew Morton <akpm@linux-foundation.org>
41 Cc: Arjan van de Ven <arjan@linux.intel.com>
42 Cc: Borislav Petkov <bp@alien8.de>
43 Cc: Dave Hansen <dave.hansen@intel.com>
44 Cc: Linus Torvalds <torvalds@linux-foundation.org>
45 Cc: Mel Gorman <mgorman@suse.de>
46 Cc: Peter Zijlstra <peterz@infradead.org>
47 Cc: Rik van Riel <riel@redhat.com>
48 Cc: linux-mm@kvack.org
49 Link: http://lkml.kernel.org/r/1210fb244bc9cbe7677f7f0b72db4d359675f24b.1498751203.git.luto@kernel.org
50 Signed-off-by: Ingo Molnar <mingo@kernel.org>
51 (cherry picked from commit b0579ade7cd82391360e959cc844e50a160e8a96)
52 Signed-off-by: Andy Whitcroft <apw@canonical.com>
53 Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
54 (cherry picked from commit d34881c25f3c70228ed792fd62881185a25c4422)
55 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
56 ---
57 arch/x86/include/asm/tlbflush.h | 43 +++++++++++++++--
58 arch/x86/mm/tlb.c | 102 +++++++++++++++++++++++++++++++++++++---
59 2 files changed, 135 insertions(+), 10 deletions(-)
60
61 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
62 index f1f2e73b7b77..3a167c214560 100644
63 --- a/arch/x86/include/asm/tlbflush.h
64 +++ b/arch/x86/include/asm/tlbflush.h
65 @@ -82,6 +82,11 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
66 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
67 #endif
68
69 +struct tlb_context {
70 + u64 ctx_id;
71 + u64 tlb_gen;
72 +};
73 +
74 struct tlb_state {
75 /*
76 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
77 @@ -97,6 +102,21 @@ struct tlb_state {
78 * disabling interrupts when modifying either one.
79 */
80 unsigned long cr4;
81 +
82 + /*
83 + * This is a list of all contexts that might exist in the TLB.
84 + * Since we don't yet use PCID, there is only one context.
85 + *
86 + * For each context, ctx_id indicates which mm the TLB's user
87 + * entries came from. As an invariant, the TLB will never
88 + * contain entries that are out-of-date as when that mm reached
89 + * the tlb_gen in the list.
90 + *
91 + * To be clear, this means that it's legal for the TLB code to
92 + * flush the TLB without updating tlb_gen. This can happen
93 + * (for now, at least) due to paravirt remote flushes.
94 + */
95 + struct tlb_context ctxs[1];
96 };
97 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
98
99 @@ -256,9 +276,26 @@ static inline void __flush_tlb_one(unsigned long addr)
100 * and page-granular flushes are available only on i486 and up.
101 */
102 struct flush_tlb_info {
103 - struct mm_struct *mm;
104 - unsigned long start;
105 - unsigned long end;
106 + /*
107 + * We support several kinds of flushes.
108 + *
109 + * - Fully flush a single mm. .mm will be set, .end will be
110 + * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
111 + * which the IPI sender is trying to catch us up.
112 + *
113 + * - Partially flush a single mm. .mm will be set, .start and
114 + * .end will indicate the range, and .new_tlb_gen will be set
115 + * such that the changes between generation .new_tlb_gen-1 and
116 + * .new_tlb_gen are entirely contained in the indicated range.
117 + *
118 + * - Fully flush all mms whose tlb_gens have been updated. .mm
119 + * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
120 + * will be zero.
121 + */
122 + struct mm_struct *mm;
123 + unsigned long start;
124 + unsigned long end;
125 + u64 new_tlb_gen;
126 };
127
128 #define local_flush_tlb() __flush_tlb()
129 diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
130 index 14f4f8f66aa8..4e5a5ddb9e4d 100644
131 --- a/arch/x86/mm/tlb.c
132 +++ b/arch/x86/mm/tlb.c
133 @@ -105,6 +105,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
134 }
135
136 this_cpu_write(cpu_tlbstate.loaded_mm, next);
137 + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
138 + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen));
139
140 WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
141 cpumask_set_cpu(cpu, mm_cpumask(next));
142 @@ -155,25 +157,102 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
143 switch_ldt(real_prev, next);
144 }
145
146 +/*
147 + * flush_tlb_func_common()'s memory ordering requirement is that any
148 + * TLB fills that happen after we flush the TLB are ordered after we
149 + * read active_mm's tlb_gen. We don't need any explicit barriers
150 + * because all x86 flush operations are serializing and the
151 + * atomic64_read operation won't be reordered by the compiler.
152 + */
153 static void flush_tlb_func_common(const struct flush_tlb_info *f,
154 bool local, enum tlb_flush_reason reason)
155 {
156 + /*
157 + * We have three different tlb_gen values in here. They are:
158 + *
159 + * - mm_tlb_gen: the latest generation.
160 + * - local_tlb_gen: the generation that this CPU has already caught
161 + * up to.
162 + * - f->new_tlb_gen: the generation that the requester of the flush
163 + * wants us to catch up to.
164 + */
165 + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
166 + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
167 + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen);
168 +
169 /* This code cannot presently handle being reentered. */
170 VM_WARN_ON(!irqs_disabled());
171
172 + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
173 + loaded_mm->context.ctx_id);
174 +
175 if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
176 + /*
177 + * leave_mm() is adequate to handle any type of flush, and
178 + * we would prefer not to receive further IPIs. leave_mm()
179 + * clears this CPU's bit in mm_cpumask().
180 + */
181 leave_mm(smp_processor_id());
182 return;
183 }
184
185 - if (f->end == TLB_FLUSH_ALL) {
186 - local_flush_tlb();
187 - if (local)
188 - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
189 - trace_tlb_flush(reason, TLB_FLUSH_ALL);
190 - } else {
191 + if (unlikely(local_tlb_gen == mm_tlb_gen)) {
192 + /*
193 + * There's nothing to do: we're already up to date. This can
194 + * happen if two concurrent flushes happen -- the first flush to
195 + * be handled can catch us all the way up, leaving no work for
196 + * the second flush.
197 + */
198 + return;
199 + }
200 +
201 + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
202 + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
203 +
204 + /*
205 + * If we get to this point, we know that our TLB is out of date.
206 + * This does not strictly imply that we need to flush (it's
207 + * possible that f->new_tlb_gen <= local_tlb_gen), but we're
208 + * going to need to flush in the very near future, so we might
209 + * as well get it over with.
210 + *
211 + * The only question is whether to do a full or partial flush.
212 + *
213 + * We do a partial flush if requested and two extra conditions
214 + * are met:
215 + *
216 + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
217 + * we've always done all needed flushes to catch up to
218 + * local_tlb_gen. If, for example, local_tlb_gen == 2 and
219 + * f->new_tlb_gen == 3, then we know that the flush needed to bring
220 + * us up to date for tlb_gen 3 is the partial flush we're
221 + * processing.
222 + *
223 + * As an example of why this check is needed, suppose that there
224 + * are two concurrent flushes. The first is a full flush that
225 + * changes context.tlb_gen from 1 to 2. The second is a partial
226 + * flush that changes context.tlb_gen from 2 to 3. If they get
227 + * processed on this CPU in reverse order, we'll see
228 + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
229 + * If we were to use __flush_tlb_single() and set local_tlb_gen to
230 + * 3, we'd break the invariant: we'd update local_tlb_gen above
231 + * 1 without the full flush that's needed for tlb_gen 2.
232 + *
233 + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
234 + * Partial TLB flushes are not all that much cheaper than full TLB
235 + * flushes, so it seems unlikely that it would be a performance win
236 + * to do a partial flush if that won't bring our TLB fully up to
237 + * date. By doing a full flush instead, we can increase
238 + * local_tlb_gen all the way to mm_tlb_gen and we can probably
239 + * avoid another flush in the very near future.
240 + */
241 + if (f->end != TLB_FLUSH_ALL &&
242 + f->new_tlb_gen == local_tlb_gen + 1 &&
243 + f->new_tlb_gen == mm_tlb_gen) {
244 + /* Partial flush */
245 unsigned long addr;
246 unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
247 +
248 addr = f->start;
249 while (addr < f->end) {
250 __flush_tlb_single(addr);
251 @@ -182,7 +261,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
252 if (local)
253 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
254 trace_tlb_flush(reason, nr_pages);
255 + } else {
256 + /* Full flush. */
257 + local_flush_tlb();
258 + if (local)
259 + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
260 + trace_tlb_flush(reason, TLB_FLUSH_ALL);
261 }
262 +
263 + /* Both paths above update our state to mm_tlb_gen. */
264 + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen);
265 }
266
267 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
268 @@ -253,7 +341,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
269 cpu = get_cpu();
270
271 /* This is also a barrier that synchronizes with switch_mm(). */
272 - inc_mm_tlb_gen(mm);
273 + info.new_tlb_gen = inc_mm_tlb_gen(mm);
274
275 /* Should we flush just the requested range? */
276 if ((end != TLB_FLUSH_ALL) &&
277 --
278 2.14.2
279
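The long comment in flush_tlb_func_common() boils down to a small
predicate. A simplified standalone restatement in plain C (with an
invented struct and helper name, not the kernel's own) might look like:

    #include <stdbool.h>
    #include <stdint.h>

    #define TLB_FLUSH_ALL ((unsigned long)-1)

    struct flush_req {
            unsigned long start, end; /* range to flush; end == TLB_FLUSH_ALL means everything */
            uint64_t new_tlb_gen;     /* generation the sender wants this CPU to reach */
    };

    /*
     * Flush page-by-page only when the request is ranged, it is the very
     * next generation after what this CPU has already handled, and it
     * brings the CPU fully up to date; otherwise a full flush is simpler
     * and lets local_tlb_gen jump straight to mm_tlb_gen.
     */
    static bool do_partial_flush(const struct flush_req *f,
                                 uint64_t local_tlb_gen, uint64_t mm_tlb_gen)
    {
            return f->end != TLB_FLUSH_ALL &&
                   f->new_tlb_gen == local_tlb_gen + 1 &&
                   f->new_tlb_gen == mm_tlb_gen;
    }

Whichever branch is taken, the CPU then records mm_tlb_gen as its new
local generation, which is what the final this_cpu_write() in the hunk
above does.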