git.proxmox.com Git - pve-kernel-jessie.git/commitdiff
Fix CVE-2016-2069: TLB flushing
author Wolfgang Bumiller <w.bumiller@proxmox.com>
Tue, 26 Jan 2016 07:44:09 +0000 (08:44 +0100)
committer Dietmar Maurer <dietmar@proxmox.com>
Tue, 26 Jan 2016 09:26:43 +0000 (10:26 +0100)
CVE-2016-2069-x86-mm-Add-barriers.patch [new file with mode: 0644]
Makefile

diff --git a/CVE-2016-2069-x86-mm-Add-barriers.patch b/CVE-2016-2069-x86-mm-Add-barriers.patch
new file mode 100644
index 0000000..e1a44b3
--- /dev/null
@@ -0,0 +1,161 @@
+From 71b3c126e61177eb693423f2e18a1914205b165e Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Wed, 6 Jan 2016 12:21:01 -0800
+Subject: [PATCH] x86/mm: Add barriers and document switch_mm()-vs-flush
+ synchronization
+
+When switch_mm() activates a new PGD, it also sets a bit that
+tells other CPUs that the PGD is in use so that TLB flush IPIs
+will be sent.  In order for that to work correctly, the bit
+needs to be visible prior to loading the PGD and therefore
+starting to fill the local TLB.
+
+Document all the barriers that make this work correctly and add
+a couple that were missing.
+
+Signed-off-by: Andy Lutomirski <luto@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-mm@kvack.org
+Cc: stable@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+---
+ arch/x86/include/asm/mmu_context.h | 33 ++++++++++++++++++++++++++++++++-
+ arch/x86/mm/tlb.c                  | 29 ++++++++++++++++++++++++++---
+ 2 files changed, 58 insertions(+), 4 deletions(-)
+
+diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
+index 379cd36..1edc9cd 100644
+--- a/arch/x86/include/asm/mmu_context.h
++++ b/arch/x86/include/asm/mmu_context.h
+@@ -116,8 +116,34 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ #endif
+               cpumask_set_cpu(cpu, mm_cpumask(next));
+-              /* Re-load page tables */
++              /*
++               * Re-load page tables.
++               *
++               * This logic has an ordering constraint:
++               *
++               *  CPU 0: Write to a PTE for 'next'
++               *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
++               *  CPU 1: set bit 1 in next's mm_cpumask
++               *  CPU 1: load from the PTE that CPU 0 writes (implicit)
++               *
++               * We need to prevent an outcome in which CPU 1 observes
++               * the new PTE value and CPU 0 observes bit 1 clear in
++               * mm_cpumask.  (If that occurs, then the IPI will never
++               * be sent, and CPU 0's TLB will contain a stale entry.)
++               *
++               * The bad outcome can occur if either CPU's load is
++               * reordered before that CPU's store, so both CPUs must
++               * execute full barriers to prevent this from happening.
++               *
++               * Thus, switch_mm needs a full barrier between the
++               * store to mm_cpumask and any operation that could load
++               * from next->pgd.  This barrier synchronizes with
++               * remote TLB flushers.  Fortunately, load_cr3 is
++               * serializing and thus acts as a full barrier.
++               *
++               */
+               load_cr3(next->pgd);
++
+               trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+               /* Stop flush ipis for the previous mm */
+@@ -156,10 +182,15 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+                        * schedule, protecting us from simultaneous changes.
+                        */
+                       cpumask_set_cpu(cpu, mm_cpumask(next));
++
+                       /*
+                        * We were in lazy tlb mode and leave_mm disabled
+                        * tlb flush IPI delivery. We must reload CR3
+                        * to make sure to use no freed page tables.
++                       *
++                       * As above, this is a barrier that forces
++                       * TLB repopulation to be ordered after the
++                       * store to mm_cpumask.
+                        */
+                       load_cr3(next->pgd);
+                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
+index 8ddb5d0..8f4cc3d 100644
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
+       preempt_disable();
+       count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
++
++      /* This is an implicit full barrier that synchronizes with switch_mm. */
+       local_flush_tlb();
++
+       trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
+       if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+               flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
+@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+       unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
+       preempt_disable();
+-      if (current->active_mm != mm)
++      if (current->active_mm != mm) {
++              /* Synchronize with switch_mm. */
++              smp_mb();
++
+               goto out;
++      }
+       if (!current->mm) {
+               leave_mm(smp_processor_id());
++
++              /* Synchronize with switch_mm. */
++              smp_mb();
++
+               goto out;
+       }
+       if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
+               base_pages_to_flush = (end - start) >> PAGE_SHIFT;
++      /*
++       * Both branches below are implicit full barriers (MOV to CR or
++       * INVLPG) that synchronize with switch_mm.
++       */
+       if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
+               base_pages_to_flush = TLB_FLUSH_ALL;
+               count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
+       preempt_disable();
+       if (current->active_mm == mm) {
+-              if (current->mm)
++              if (current->mm) {
++                      /*
++                       * Implicit full barrier (INVLPG) that synchronizes
++                       * with switch_mm.
++                       */
+                       __flush_tlb_one(start);
+-              else
++              } else {
+                       leave_mm(smp_processor_id());
++
++                      /* Synchronize with switch_mm. */
++                      smp_mb();
++              }
+       }
+       if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+-- 
+2.1.4
+
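The comment added to switch_mm() above describes the classic store->load ("store buffering") hazard: each CPU performs a store (the PTE write, or the mm_cpumask bit) and then a load of the other side's variable, and if either load is reordered before its store, the flushing CPU sees no bit set (so it sends no IPI) while the switching CPU has already filled its TLB from the old PTE. The following is a minimal user-space sketch of that pattern, not kernel code: the names flusher, switcher, pte and cpumask_bit are illustrative stand-ins, and the C11 seq_cst fences stand in for the smp_mb() calls and the serializing CR3/INVLPG writes the patch relies on.

/* User-space analogy of the switch_mm()/TLB-flush ordering; illustrative only. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pte;          /* stands in for the PTE "CPU 0" updates        */
static atomic_int cpumask_bit;  /* stands in for "CPU 1"'s bit in mm_cpumask    */
static int saw_bit, saw_pte;    /* what each side observed                      */

/* "CPU 0": write the PTE, then check mm_cpumask to decide whether to send an IPI. */
static void *flusher(void *arg)
{
        (void)arg;
        atomic_store_explicit(&pte, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* the smp_mb() side     */
        saw_bit = atomic_load_explicit(&cpumask_bit, memory_order_relaxed);
        return NULL;
}

/* "CPU 1": set the mm_cpumask bit, then load through the page tables. */
static void *switcher(void *arg)
{
        (void)arg;
        atomic_store_explicit(&cpumask_bit, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* the load_cr3() side   */
        saw_pte = atomic_load_explicit(&pte, memory_order_relaxed);
        return NULL;
}

int main(void)
{
        for (int i = 0; i < 100000; i++) {
                pthread_t a, b;

                atomic_store(&pte, 0);
                atomic_store(&cpumask_bit, 0);
                pthread_create(&a, NULL, flusher, NULL);
                pthread_create(&b, NULL, switcher, NULL);
                pthread_join(a, NULL);
                pthread_join(b, NULL);

                /* Both zero would mean: no IPI sent *and* a stale TLB fill kept. */
                if (saw_bit == 0 && saw_pte == 0) {
                        printf("forbidden outcome on iteration %d\n", i);
                        return 1;
                }
        }
        printf("forbidden outcome never observed\n");
        return 0;
}

Built with something like "cc -O2 -pthread sb.c" (file name hypothetical), the forbidden outcome should never be reported; with the two fences removed, the both-zero result becomes a legal outcome that real x86 store buffering can produce, which is exactly the stale-TLB window the kernel patch closes.
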
diff --git a/Makefile b/Makefile
index f4a823750006c4817ddc8c97aab3e70f62668a61..6b62ea678f79690fe325ab485a520e9f38ea6443 100644
--- a/Makefile
+++ b/Makefile
@@ -242,6 +242,7 @@ ${KERNEL_SRC}/README ${KERNEL_CFG_ORG}: ${KERNELSRCTAR}
        cd ${KERNEL_SRC}; patch -p1 <../apparmor-socket-mediation.patch
        cd ${KERNEL_SRC}; patch -p1 <../CVE-2015-7513-KVM-x86-Reload-pit-counters-for-all-channels.patch
        cd ${KERNEL_SRC}; patch -p1 <../CVE-2015-8785-fuse-break-infinite-loop-in-fuse_fill_write_pages.patch
+       cd ${KERNEL_SRC}; patch -p1 <../CVE-2016-2069-x86-mm-Add-barriers.patch
        # backport iSCSI fix from 4.4rc5
        cd ${KERNEL_SRC}; patch -p1 <../iSCSI-block-sd-Fix-device-imposed-transfer-length-limits.patch
        # backport aacraid update from kernel 4.4rc5