git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blobdiff - mm/memory.c
UBUNTU: Ubuntu-4.15.0-96.97
index ca5674cbaff2b65c4e51086e5922fbbd274f2cfa..9a779c0d31b5a06d6146f6a245b4bf07d57003b8 100644 (file)
@@ -81,7 +81,7 @@
 
 #include "internal.h"
 
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
 #endif
 
@@ -246,9 +246,6 @@ static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 
        tlb_flush(tlb);
        mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-       tlb_table_flush(tlb);
-#endif
        __tlb_reset_range(tlb);
 }
 
@@ -256,6 +253,9 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
 {
        struct mmu_gather_batch *batch;
 
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+       tlb_table_flush(tlb);
+#endif
        for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
                free_pages_and_swap_cache(batch->pages, batch->nr);
                batch->nr = 0;
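
Moving tlb_table_flush() out of tlb_flush_mmu_tlbonly() and into tlb_flush_mmu_free() does not reorder anything for the common caller: tlb_flush_mmu() in this same file still invalidates the hardware TLB before the batched page tables are queued for freeing. Roughly:

        void tlb_flush_mmu(struct mmu_gather *tlb)
        {
                /* flush the hardware TLB for the gathered range first ... */
                tlb_flush_mmu_tlbonly(tlb);
                /* ... then free batched pages and (now) RCU-queue page tables */
                tlb_flush_mmu_free(tlb);
        }
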
@@ -331,6 +331,21 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
  * See the comment near struct mmu_table_batch.
  */
 
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+{
+#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
+       /*
+        * Invalidate page-table caches used by hardware walkers. Then we still
+        * need to RCU-sched wait while freeing the pages because software
+        * walkers can still be in-flight.
+        */
+       tlb_flush_mmu_tlbonly(tlb);
+#endif
+}
+
 static void tlb_remove_table_smp_sync(void *arg)
 {
        /* Simply deliver the interrupt */
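
tlb_table_invalidate() only forces a TLB flush where CONFIG_HAVE_RCU_TABLE_INVALIDATE says hardware walkers cache page-table entries; the RCU-sched delay is still needed for lockless software walkers such as get_user_pages_fast(). The IPI fallback served by tlb_remove_table_smp_sync() sits a little further down in this file, unchanged here, and looks roughly like:

        static void tlb_remove_table_one(void *table)
        {
                /*
                 * Not an RCU grace period, but an IPI to every CPU has the
                 * same effect: software walkers run with interrupts disabled,
                 * so none can still be traversing this table once every CPU
                 * has acknowledged the interrupt.
                 */
                smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
                __tlb_remove_table(table);
        }
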
@@ -367,6 +382,7 @@ void tlb_table_flush(struct mmu_gather *tlb)
        struct mmu_table_batch **batch = &tlb->batch;
 
        if (*batch) {
+               tlb_table_invalidate(tlb);
                call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
                *batch = NULL;
        }
@@ -376,23 +392,16 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 {
        struct mmu_table_batch **batch = &tlb->batch;
 
-       /*
-        * When there's less then two users of this mm there cannot be a
-        * concurrent page-table walk.
-        */
-       if (atomic_read(&tlb->mm->mm_users) < 2) {
-               __tlb_remove_table(table);
-               return;
-       }
-
        if (*batch == NULL) {
                *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                if (*batch == NULL) {
+                       tlb_table_invalidate(tlb);
                        tlb_remove_table_one(table);
                        return;
                }
                (*batch)->nr = 0;
        }
+
        (*batch)->tables[(*batch)->nr++] = table;
        if ((*batch)->nr == MAX_TABLE_BATCH)
                tlb_table_flush(tlb);
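
For context, the batch filled here is declared in include/asm-generic/tlb.h; MAX_TABLE_BATCH is simply how many table pointers fit in one page next to the RCU head:

        struct mmu_table_batch {
                struct rcu_head         rcu;
                unsigned int            nr;
                void                    *tables[0];
        };

        #define MAX_TABLE_BATCH \
                ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *))

Dropping the mm_users short-cut above means even a single-user mm now takes the batching (or invalidate-plus-IPI) path, so the new tlb_table_invalidate() call cannot be bypassed.
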
@@ -1418,11 +1427,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
        do {
                next = pmd_addr_end(addr, end);
                if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
-                       if (next - addr != HPAGE_PMD_SIZE) {
-                               VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
-                                   !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+                       if (next - addr != HPAGE_PMD_SIZE)
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
-                       } else if (zap_huge_pmd(tlb, vma, pmd, addr))
+                       else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
                        /* fall through */
                }
@@ -1798,14 +1805,21 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                         * in may not match the PFN we have mapped if the
                         * mapped PFN is a writeable COW page.  In the mkwrite
                         * case we are creating a writable PTE for a shared
-                        * mapping and we expect the PFNs to match.
+                        * mapping and we expect the PFNs to match. If they
+                        * don't match, we are likely racing with block
+                        * allocation and mapping invalidation so just skip the
+                        * update.
                         */
-                       if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+                       if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
+                               WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
                                goto out_unlock;
-                       entry = *pte;
-                       goto out_mkwrite;
-               } else
-                       goto out_unlock;
+                       }
+                       entry = pte_mkyoung(*pte);
+                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+                       if (ptep_set_access_flags(vma, addr, pte, entry, 1))
+                               update_mmu_cache(vma, addr, pte);
+               }
+               goto out_unlock;
        }
 
        /* Ok, finally just insert the thing.. */
@@ -1814,7 +1828,6 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
        else
                entry = pte_mkspecial(pfn_t_pte(pfn, prot));
 
-out_mkwrite:
        if (mkwrite) {
                entry = pte_mkyoung(entry);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
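
The mkwrite path of insert_pfn() is reached through vm_insert_mixed_mkwrite(), defined later in this file, which callers such as DAX use to upgrade an already-present read-only pfn mapping on a write fault. A sketch of such a caller; my_dev_fault() and my_dev_pfn() are hypothetical, only the two vm_insert_* helpers are real:

        static int my_dev_fault(struct vm_fault *vmf)
        {
                pfn_t pfn = my_dev_pfn(vmf->vma, vmf->address); /* assumed helper */
                int err;

                if (vmf->flags & FAULT_FLAG_WRITE)
                        err = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
                else
                        err = vm_insert_mixed(vmf->vma, vmf->address, pfn);

                return err ? VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
        }
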
@@ -1888,6 +1901,9 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
 
+       if (!pfn_modify_allowed(pfn, pgprot))
+               return -EACCES;
+
        track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
 
        ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
@@ -1897,18 +1913,35 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(vm_insert_pfn_prot);
 
+static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
+{
+       /* these checks mirror the abort conditions in vm_normal_page */
+       if (vma->vm_flags & VM_MIXEDMAP)
+               return true;
+       if (pfn_t_devmap(pfn))
+               return true;
+       if (pfn_t_special(pfn))
+               return true;
+       if (is_zero_pfn(pfn_t_to_pfn(pfn)))
+               return true;
+       return false;
+}
+
 static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn, bool mkwrite)
 {
        pgprot_t pgprot = vma->vm_page_prot;
 
-       BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
+       BUG_ON(!vm_mixed_ok(vma, pfn));
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
 
        track_pfn_insert(vma, &pgprot, pfn);
 
+       if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+               return -EACCES;
+
        /*
         * If we don't have pte special, then we have to use the pfn_valid()
         * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
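
pfn_modify_allowed() is the L1TF guard added by the same series: x86 overrides it to reject pfns that an unprivileged user could abuse through not-present (PROT_NONE) PTEs, while other architectures keep the permissive stub from include/asm-generic/pgtable.h, roughly:

        #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
        static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
        {
                return true;
        }
        #endif
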
@@ -1956,6 +1989,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 {
        pte_t *pte;
        spinlock_t *ptl;
+       int err = 0;
 
        pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
@@ -1963,12 +1997,16 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
        arch_enter_lazy_mmu_mode();
        do {
                BUG_ON(!pte_none(*pte));
+               if (!pfn_modify_allowed(pfn, prot)) {
+                       err = -EACCES;
+                       break;
+               }
                set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
                pfn++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);
-       return 0;
+       return err;
 }
 
 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1977,6 +2015,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 {
        pmd_t *pmd;
        unsigned long next;
+       int err;
 
        pfn -= addr >> PAGE_SHIFT;
        pmd = pmd_alloc(mm, pud, addr);
@@ -1985,9 +2024,10 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
        VM_BUG_ON(pmd_trans_huge(*pmd));
        do {
                next = pmd_addr_end(addr, end);
-               if (remap_pte_range(mm, pmd, addr, next,
-                               pfn + (addr >> PAGE_SHIFT), prot))
-                       return -ENOMEM;
+               err = remap_pte_range(mm, pmd, addr, next,
+                               pfn + (addr >> PAGE_SHIFT), prot);
+               if (err)
+                       return err;
        } while (pmd++, addr = next, addr != end);
        return 0;
 }
@@ -1998,6 +2038,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
 {
        pud_t *pud;
        unsigned long next;
+       int err;
 
        pfn -= addr >> PAGE_SHIFT;
        pud = pud_alloc(mm, p4d, addr);
@@ -2005,9 +2046,10 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
                return -ENOMEM;
        do {
                next = pud_addr_end(addr, end);
-               if (remap_pmd_range(mm, pud, addr, next,
-                               pfn + (addr >> PAGE_SHIFT), prot))
-                       return -ENOMEM;
+               err = remap_pmd_range(mm, pud, addr, next,
+                               pfn + (addr >> PAGE_SHIFT), prot);
+               if (err)
+                       return err;
        } while (pud++, addr = next, addr != end);
        return 0;
 }
@@ -2018,6 +2060,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 {
        p4d_t *p4d;
        unsigned long next;
+       int err;
 
        pfn -= addr >> PAGE_SHIFT;
        p4d = p4d_alloc(mm, pgd, addr);
@@ -2025,9 +2068,10 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                return -ENOMEM;
        do {
                next = p4d_addr_end(addr, end);
-               if (remap_pud_range(mm, p4d, addr, next,
-                               pfn + (addr >> PAGE_SHIFT), prot))
-                       return -ENOMEM;
+               err = remap_pud_range(mm, p4d, addr, next,
+                               pfn + (addr >> PAGE_SHIFT), prot);
+               if (err)
+                       return err;
        } while (p4d++, addr = next, addr != end);
        return 0;
 }
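
With remap_pte_range() now able to fail with -EACCES as well as -ENOMEM, each level forwards whatever the leaf returned instead of collapsing everything into -ENOMEM. The top-level loop in remap_pfn_range() (outside these hunks) already propagates err the same way, roughly:

        do {
                next = pgd_addr_end(addr, end);
                err = remap_p4d_range(mm, pgd, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
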
@@ -2857,8 +2901,11 @@ int do_swap_page(struct vm_fault *vmf)
        int ret = 0;
        bool vma_readahead = swap_use_vma_readahead();
 
-       if (vma_readahead)
+       if (vma_readahead) {
                page = swap_readahead_detect(vmf, &swap_ra);
+               swapcache = page;
+       }
+
        if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
                if (page)
                        put_page(page);
@@ -2889,9 +2936,12 @@ int do_swap_page(struct vm_fault *vmf)
 
 
        delayacct_set_flag(DELAYACCT_PF_SWAPIN);
-       if (!page)
+       if (!page) {
                page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
                                         vmf->address);
+               swapcache = page;
+       }
+
        if (!page) {
                struct swap_info_struct *si = swp_swap_info(entry);
 
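
Keeping swapcache pointed at whatever swap_readahead_detect()/lookup_swap_cache() returned matters further down in do_swap_page(): ksm_might_need_to_copy() may substitute a private copy, after which page no longer names the swap-cache page that still has to be unlocked and released. A simplified sketch of that later cleanup (not the exact 4.15 code):

        if (page != swapcache && swapcache) {
                /* the fault was served from a KSM copy; drop the original */
                unlock_page(swapcache);
                put_page(swapcache);
        }
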
@@ -3193,6 +3243,29 @@ static int __do_fault(struct vm_fault *vmf)
        struct vm_area_struct *vma = vmf->vma;
        int ret;
 
+       /*
+        * Preallocate pte before we take page_lock because this might lead to
+        * deadlocks for memcg reclaim which waits for pages under writeback:
+        *                              lock_page(A)
+        *                              SetPageWriteback(A)
+        *                              unlock_page(A)
+        * lock_page(B)
+        *                              lock_page(B)
+        * pte_alloc_pne
+        *   shrink_page_list
+        *     wait_on_page_writeback(A)
+        *                              SetPageWriteback(B)
+        *                              unlock_page(B)
+        *                              # flush A, B to clear the writeback
+        */
+       if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+               vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
+                                                 vmf->address);
+               if (!vmf->prealloc_pte)
+                       return VM_FAULT_OOM;
+               smp_wmb(); /* See comment in __pte_alloc() */
+       }
+
        ret = vma->vm_ops->fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
                            VM_FAULT_DONE_COW)))
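
Only the GFP_KERNEL allocation must stay out from under the page lock; installing an already-allocated table later is fine. pte_alloc_one_map() in this file consumes the preallocation under the pmd lock, along these lines (simplified, accounting elided):

        if (vmf->prealloc_pte) {
                vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
                if (likely(pmd_none(*vmf->pmd))) {
                        pmd_populate(vmf->vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
                        vmf->prealloc_pte = NULL;
                }
                spin_unlock(vmf->ptl);
        }
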
@@ -3693,16 +3766,45 @@ static int do_shared_fault(struct vm_fault *vmf)
  * but allow concurrent faults).
  * The mmap_sem may have been released depending on flags and our
  * return value.  See filemap_fault() and __lock_page_or_retry().
+ * If mmap_sem is released, vma may become invalid (for example
+ * by other thread calling munmap()).
  */
 static int do_fault(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
+       struct mm_struct *vm_mm = vma->vm_mm;
        int ret;
 
-       /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
-       if (!vma->vm_ops->fault)
-               ret = VM_FAULT_SIGBUS;
-       else if (!(vmf->flags & FAULT_FLAG_WRITE))
+       /*
+        * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
+        */
+       if (!vma->vm_ops->fault) {
+               /*
+                * If we find a migration pmd entry or a none pmd entry, which
+                * should never happen, return SIGBUS
+                */
+               if (unlikely(!pmd_present(*vmf->pmd)))
+                       ret = VM_FAULT_SIGBUS;
+               else {
+                       vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
+                                                      vmf->pmd,
+                                                      vmf->address,
+                                                      &vmf->ptl);
+                       /*
+                        * Make sure this is not a temporary clearing of pte
+                        * by holding ptl and checking again. A R/M/W update
+                        * of pte involves: take ptl, clearing the pte so that
+                        * we don't have concurrent modification by hardware
+                        * followed by an update.
+                        */
+                       if (unlikely(pte_none(*vmf->pte)))
+                               ret = VM_FAULT_SIGBUS;
+                       else
+                               ret = VM_FAULT_NOPAGE;
+
+                       pte_unmap_unlock(vmf->pte, vmf->ptl);
+               }
+       } else if (!(vmf->flags & FAULT_FLAG_WRITE))
                ret = do_read_fault(vmf);
        else if (!(vma->vm_flags & VM_SHARED))
                ret = do_cow_fault(vmf);
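
The pte_none() recheck under ptl matters because another thread can legitimately make the pte transiently none during a read-modify-write update. A generic sketch of such a writer (hypothetical, just to show the window the comment describes):

        pte_t oldpte, newpte;

        spin_lock(ptl);
        oldpte = ptep_get_and_clear(mm, addr, pte);     /* pte is briefly none */
        newpte = pte_modify(oldpte, newprot);
        set_pte_at(mm, addr, pte, newpte);
        spin_unlock(ptl);

A lockless reader that inspected *pte between the clear and the set would see pte_none(); taking ptl and checking again, as above, rules that out.
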
@@ -3711,7 +3813,7 @@ static int do_fault(struct vm_fault *vmf)
 
        /* preallocated pagetable is unused: free it */
        if (vmf->prealloc_pte) {
-               pte_free(vma->vm_mm, vmf->prealloc_pte);
+               pte_free(vm_mm, vmf->prealloc_pte);
                vmf->prealloc_pte = NULL;
        }
        return ret;
@@ -4362,6 +4464,9 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                return -EINVAL;
 
        maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
+       if (!maddr)
+               return -ENOMEM;
+
        if (write)
                memcpy_toio(maddr + offset, buf, len);
        else
@@ -4384,7 +4489,9 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
        void *old_buf = buf;
        int write = gup_flags & FOLL_WRITE;
 
-       down_read(&mm->mmap_sem);
+       if (down_read_killable(&mm->mmap_sem))
+               return 0;
+
        /* ignore errors, just check how much was successfully transferred */
        while (len) {
                int bytes, ret, offset;
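
down_read_killable() returns 0 once the semaphore is acquired and -EINTR if the sleeping task receives a fatal signal, so a killed task now bails out reporting zero bytes transferred instead of blocking uninterruptibly on mmap_sem. Its declaration, for reference:

        /* include/linux/rwsem.h */
        extern int down_read_killable(struct rw_semaphore *sem);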