mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries

author Mel Gorman <mgorman@suse.de>

Wed, 2 Aug 2017 20:31:52 +0000 (13:31 -0700)

committer Thadeu Lima de Souza Cascardo <cascardo@canonical.com>

Tue, 22 Aug 2017 09:58:12 +0000 (06:58 -0300)
author Mel Gorman <mgorman@suse.de>
Wed, 2 Aug 2017 20:31:52 +0000 (13:31 -0700)
committer Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Tue, 22 Aug 2017 09:58:12 +0000 (06:58 -0300)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index c0052c644687df171df31643cdcb9ccaed6853fa..04b48bb943ad92617e9d43b618c008c096674b12 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -505,6 +505,10 @@ struct mm_struct {
          * PROT_NONE or PROT_NUMA mapped page.
          */
         bool tlb_flush_pending;
+#endif
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+       /* See flush_tlb_batched_pending() */
+       bool tlb_flush_batched;
  #endif
         struct uprobes_state uprobes_state;
  #ifdef CONFIG_X86_INTEL_MPX
diff --git a/mm/internal.h b/mm/internal.h

index fee09cdb1086f6cf5691455673b1957a9ddac248..1d5684f59efd6e0086bc857fab8edf632f654e1d 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -454,6 +454,7 @@ struct tlbflush_unmap_batch;
  #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
  void try_to_unmap_flush(void);
  void try_to_unmap_flush_dirty(void);
+void flush_tlb_batched_pending(struct mm_struct *mm);
  #else
  static inline void try_to_unmap_flush(void)
  {
@@ -461,6 +462,8 @@ static inline void try_to_unmap_flush(void)
  static inline void try_to_unmap_flush_dirty(void)
  {
  }
-
+static inline void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+}
  #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
  #endif /* __MM_INTERNAL_H */
diff --git a/mm/memory.c b/mm/memory.c

index 16390b50a564833166ded59d0debbaaa89163262..0bac6d97902aaa96a533780af7afbc19d0c354a1 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1127,6 +1127,7 @@ again:
         init_rss_vec(rss);
         start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
         pte = start_pte;
+       flush_tlb_batched_pending(mm);
         arch_enter_lazy_mmu_mode();
         do {
                 pte_t ptent = *pte;
diff --git a/mm/mprotect.c b/mm/mprotect.c

index ef5be8eaab001792b469fac1bd5b43cb139d1b0b..c0b4b2a4946284bc98b4e953c7b69b2a0cde4933 100644 (file)
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -72,6 +72,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
         if (!pte)
                 return 0;
  
+       flush_tlb_batched_pending(vma->vm_mm);
         arch_enter_lazy_mmu_mode();
         do {
                 oldpte = *pte;
diff --git a/mm/mremap.c b/mm/mremap.c

index c25bc6268e46506dba0320d15bb5c2e7d3e079f3..fe7b7f65f4f435463cb7891fc0b79e4c68b85a0a 100644 (file)
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -135,6 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
         new_ptl = pte_lockptr(mm, new_pmd);
         if (new_ptl != old_ptl)
                 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+       flush_tlb_batched_pending(vma->vm_mm);
         arch_enter_lazy_mmu_mode();
  
         for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
diff --git a/mm/rmap.c b/mm/rmap.c

index b577fbb98d4baf352fa5e51cc536d02356b8392e..ede183c32f45630d94b5b8f67ec64eedfa60e4fc 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -648,6 +648,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
         cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
         tlb_ubc->flush_required = true;
  
+       /*
+        * Ensure compiler does not re-order the setting of tlb_flush_batched
+        * before the PTE is cleared.
+        */
+       barrier();
+       mm->tlb_flush_batched = true;
+
         /*
          * If the PTE was dirty then it's best to assume it's writable. The
          * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
@@ -675,6 +682,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
  
         return should_defer;
  }
+
+/*
+ * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and munmap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+       if (mm->tlb_flush_batched) {
+               flush_tlb_mm(mm);
+
+               /*
+                * Do not allow the compiler to re-order the clearing of
+                * tlb_flush_batched before the tlb is flushed.
+                */
+               barrier();
+               mm->tlb_flush_batched = false;
+       }
+}
  #else
  static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
                 struct page *page, bool writable)
author	Mel Gorman <mgorman@suse.de>
	Wed, 2 Aug 2017 20:31:52 +0000 (13:31 -0700)
committer	Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
	Tue, 22 Aug 2017 09:58:12 +0000 (06:58 -0300)
include/linux/mm_types.h		patch \| blob \| blame \| history
mm/internal.h		patch \| blob \| blame \| history
mm/memory.c		patch \| blob \| blame \| history
mm/mprotect.c		patch \| blob \| blame \| history
mm/mremap.c		patch \| blob \| blame \| history
mm/rmap.c		patch \| blob \| blame \| history