git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blobdiff - mm/memory.c
UBUNTU: Ubuntu-4.15.0-96.97
index ca5674cbaff2b65c4e51086e5922fbbd274f2cfa..9a779c0d31b5a06d6146f6a245b4bf07d57003b8 100644 (file)
@@ -81,7 +81,7 @@
 
 #include "internal.h"
 
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
 #endif
 
@@ -246,9 +246,6 @@ static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 
        tlb_flush(tlb);
        mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-       tlb_table_flush(tlb);
-#endif
        __tlb_reset_range(tlb);
 }
 
@@ -256,6 +253,9 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
 {
        struct mmu_gather_batch *batch;
 
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+       tlb_table_flush(tlb);
+#endif
        for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
                free_pages_and_swap_cache(batch->pages, batch->nr);
                batch->nr = 0;
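
Moving tlb_table_flush() out of tlb_flush_mmu_tlbonly() and into tlb_flush_mmu_free() does not reorder anything for the common caller: tlb_flush_mmu() in this same file still invalidates the hardware TLB before the batched page tables are queued for freeing. Roughly:

        void tlb_flush_mmu(struct mmu_gather *tlb)
        {
                /* flush the hardware TLB for the gathered range first ... */
                tlb_flush_mmu_tlbonly(tlb);
                /* ... then free batched pages and (now) RCU-queue page tables */
                tlb_flush_mmu_free(tlb);
        }
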
@@ -331,6 +331,21 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
  * See the comment near struct mmu_table_batch.
  */
 
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+{
+#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
+       /*
+        * Invalidate page-table caches used by hardware walkers. Then we still
+        * need to RCU-sched wait while freeing the pages because software
+        * walkers can still be in-flight.
+        */
+       tlb_flush_mmu_tlbonly(tlb);
+#endif
+}
+
 static void tlb_remove_table_smp_sync(void *arg)
 {
        /* Simply deliver the interrupt */
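
tlb_table_invalidate() only forces a TLB flush where CONFIG_HAVE_RCU_TABLE_INVALIDATE says hardware walkers cache page-table entries; the RCU-sched delay is still needed for lockless software walkers such as get_user_pages_fast(). The IPI fallback served by tlb_remove_table_smp_sync() sits a little further down in this file, unchanged here, and looks roughly like:

        static void tlb_remove_table_one(void *table)
        {
                /*
                 * Not an RCU grace period, but an IPI to every CPU has the
                 * same effect: software walkers run with interrupts disabled,
                 * so none can still be traversing this table once every CPU
                 * has acknowledged the interrupt.
                 */
                smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
                __tlb_remove_table(table);
        }
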
@@ -367,6 +382,7 @@ void tlb_table_flush(struct mmu_gather *tlb)
        struct mmu_table_batch **batch = &tlb->batch;
 
        if (*batch) {
+               tlb_table_invalidate(tlb);
                call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
                *batch = NULL;
        }
@@ -376,23 +392,16 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 {
        struct mmu_table_batch **batch = &tlb->batch;
 
-       /*
-        * When there's less then two users of this mm there cannot be a
-        * concurrent page-table walk.
-        */
-       if (atomic_read(&tlb->mm->mm_users) < 2) {
-               __tlb_remove_table(table);
-               return;
-       }
-
        if (*batch == NULL) {
                *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                if (*batch == NULL) {
+                       tlb_table_invalidate(tlb);
                        tlb_remove_table_one(table);
                        return;
                }
                (*batch)->nr = 0;
        }
+
        (*batch)->tables[(*batch)->nr++] = table;
        if ((*batch)->nr == MAX_TABLE_BATCH)
                tlb_table_flush(tlb);
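
For context, the batch filled here is declared in include/asm-generic/tlb.h; MAX_TABLE_BATCH is simply how many table pointers fit in one page next to the RCU head:

        struct mmu_table_batch {
                struct rcu_head         rcu;
                unsigned int            nr;
                void                    *tables[0];
        };

        #define MAX_TABLE_BATCH \
                ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *))

Dropping the mm_users short-cut above means even a single-user mm now takes the batching (or invalidate-plus-IPI) path, so the new tlb_table_invalidate() call cannot be bypassed.
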
@@ -1418,11 +1427,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
        do {
                next = pmd_addr_end(addr, end);
                if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
-                       if (next - addr != HPAGE_PMD_SIZE) {
-                               VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
-                                   !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+                       if (next - addr != HPAGE_PMD_SIZE)
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
-                       } else if (zap_huge_pmd(tlb, vma, pmd, addr))
+                       else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
                        /* fall through */
                }
@@ -1798,14 +1805,21 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                         * in may not match the PFN we have mapped if the
                         * mapped PFN is a writeable COW page.  In the mkwrite
                         * case we are creating a writable PTE for a shared
-                        * mapping and we expect the PFNs to match.
+                        * mapping and we expect the PFNs to match. If they
+                        * don't match, we are likely racing with block
+                        * allocation and mapping invalidation so just skip the
+                        * update.
                         */
-                       if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+                       if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
+                               WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
                                goto out_unlock;
-                       entry = *pte;
-                       goto out_mkwrite;
-               } else
-                       goto out_unlock;
+                       }
+                       entry = pte_mkyoung(*pte);
+                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+                       if (ptep_set_access_flags(vma, addr, pte, entry, 1))
+                               update_mmu_cache(vma, addr, pte);
+               }
+               goto out_unlock;
        }
 
        /* Ok, finally just insert the thing.. */
@@ -1814,7 +1828,6 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
        else
                entry = pte_mkspecial(pfn_t_pte(pfn, prot));
 
-out_mkwrite:
        if (mkwrite) {
                entry = pte_mkyoung(entry);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
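
The mkwrite path of insert_pfn() is reached through vm_insert_mixed_mkwrite(), defined later in this file, which callers such as DAX use to upgrade an already-present read-only pfn mapping on a write fault. A sketch of such a caller; my_dev_fault() and my_dev_pfn() are hypothetical, only the two vm_insert_* helpers are real:

        static int my_dev_fault(struct vm_fault *vmf)
        {
                pfn_t pfn = my_dev_pfn(vmf->vma, vmf->address); /* assumed helper */
                int err;

                if (vmf->flags & FAULT_FLAG_WRITE)
                        err = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
                else
                        err = vm_insert_mixed(vmf->vma, vmf->address, pfn);

                return err ? VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
        }
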
@@ -1888,6 +1901,9 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
 
+       if (!pfn_modify_allowed(pfn, pgprot))
+               return -EACCES;
+
        track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
 
        ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
@@ -1897,18 +1913,35 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(vm_insert_pfn_prot);
 
+static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
+{
+       /* these checks mirror the abort conditions in vm_normal_page */
+       if (vma->vm_flags & VM_MIXEDMAP)
+               return true;
+       if (pfn_t_devmap(pfn))
+               return true;
+       if (pfn_t_special(pfn))
+               return true;
+       if (is_zero_pfn(pfn_t_to_pfn(pfn)))
+               return true;
+       return false;
+}
+
 static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn, bool mkwrite)
 {
        pgprot_t pgprot = vma->vm_page_prot;
 
-       BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
+       BUG_ON(!vm_mixed_ok(vma, pfn));
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
 
        track_pfn_insert(vma, &pgprot, pfn);
 
+       if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+               return -EACCES;
+
        /*
         * If we don't have pte special, then we have to use the pfn_valid()
         * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
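
pfn_modify_allowed() is the L1TF guard added by the same series: x86 overrides it to reject pfns that an unprivileged user could abuse through not-present (PROT_NONE) PTEs, while other architectures keep the permissive stub from include/asm-generic/pgtable.h, roughly:

        #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
        static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
        {
                return true;
        }
        #endif
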
@@ -1956,6 +1989,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 {
        pte_t *pte;
        spinlock_t *ptl;
+       int err = 0;
 
        pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
@@ -1963,12 +1997,16 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
        arch_enter_lazy_mmu_mode();
        do {
                BUG_ON(!pte_none(*pte));
+               if (!pfn_modify_allowed(pfn, prot)) {
+                       err = -EACCES;
+                       break;
+               }
                set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
                pfn++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);
-       return 0;
+       return err;
 }
 
 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1977,6 +2015,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 {
        pmd_t *pmd;
        unsigned long next;
+       int err;
 
        pfn -= addr >> PAGE_SHIFT;
        pmd = pmd_alloc(mm, pud, addr);
@@ -1985,9 +2024,10 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
        VM_BUG_ON(pmd_trans_huge(*pmd));
        do {
                next = pmd_addr_end(addr, end);
-               if (remap_pte_range(mm, pmd, addr, next,
-                               pfn + (addr >> PAGE_SHIFT), prot))
-                       return -ENOMEM;
+               err = remap_pte_range(mm, pmd, addr, next,
+                               pfn + (addr >> PAGE_SHIFT), prot);
+               if (err)
+                       return err;
        } while (pmd++, addr = next, addr != end);
        return 0;
 }
@@ -1998,6 +2038,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
 {
        pud_t *pud;
        unsigned long next;
+       int err;
 
        pfn -= addr >> PAGE_SHIFT;
        pud = pud_alloc(mm, p4d, addr);
@@ -2005,9 +2046,10 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
                return -ENOMEM;
        do {
                next = pud_addr_end(addr, end);
-               if (remap_pmd_range(mm, pud, addr, next,
-                               pfn + (addr >> PAGE_SHIFT), prot))
-                       return -ENOMEM;
+               err = remap_pmd_range(mm, pud, addr, next,
+                               pfn + (addr >> PAGE_SHIFT), prot);
+               if (err)
+                       return err;
        } while (pud++, addr = next, addr != end);
        return 0;
 }
@@ -2018,6 +2060,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 {
        p4d_t *p4d;
        unsigned long next;
+       int err;
 
        pfn -= addr >> PAGE_SHIFT;
        p4d = p4d_alloc(mm, pgd, addr);
@@ -2025,9 +2068,10 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                return -ENOMEM;
        do {
                next = p4d_addr_end(addr, end);
-               if (remap_pud_range(mm, p4d, addr, next,
-                               pfn + (addr >> PAGE_SHIFT), prot))
-                       return -ENOMEM;
+               err = remap_pud_range(mm, p4d, addr, next,
+                               pfn + (addr >> PAGE_SHIFT), prot);
+               if (err)
+                       return err;
        } while (p4d++, addr = next, addr != end);
        return 0;
 }
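
With remap_pte_range() now able to fail with -EACCES as well as -ENOMEM, each level forwards whatever the leaf returned instead of collapsing everything into -ENOMEM. The top-level loop in remap_pfn_range() (outside these hunks) already propagates err the same way, roughly:

        do {
                next = pgd_addr_end(addr, end);
                err = remap_p4d_range(mm, pgd, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
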
@@ -2857,8 +2901,11 @@ int do_swap_page(struct vm_fault *vmf)
        int ret = 0;
        bool vma_readahead = swap_use_vma_readahead();
 
-       if (vma_readahead)
+       if (vma_readahead) {
                page = swap_readahead_detect(vmf, &swap_ra);
+               swapcache = page;
+       }
+
        if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
                if (page)
                        put_page(page);
@@ -2889,9 +2936,12 @@ int do_swap_page(struct vm_fault *vmf)
 
 
        delayacct_set_flag(DELAYACCT_PF_SWAPIN);
-       if (!page)
+       if (!page) {
                page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
                                         vmf->address);
+               swapcache = page;
+       }
+
        if (!page) {
                struct swap_info_struct *si = swp_swap_info(entry);
 
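
Keeping swapcache pointed at whatever swap_readahead_detect()/lookup_swap_cache() returned matters further down in do_swap_page(): ksm_might_need_to_copy() may substitute a private copy, after which page no longer names the swap-cache page that still has to be unlocked and released. A simplified sketch of that later cleanup (not the exact 4.15 code):

        if (page != swapcache && swapcache) {
                /* the fault was served from a KSM copy; drop the original */
                unlock_page(swapcache);
                put_page(swapcache);
        }
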
@@ -3193,6 +3243,29 @@ static int __do_fault(struct vm_fault *vmf)
        struct vm_area_struct *vma = vmf->vma;
        int ret;
 
+       /*
+        * Preallocate pte before we take page_lock because this might lead to
+        * deadlocks for memcg reclaim which waits for pages under writeback:
+        *                              lock_page(A)
+        *                              SetPageWriteback(A)
+        *                              unlock_page(A)
+        * lock_page(B)
+        *                              lock_page(B)
+        * pte_alloc_pne
+        *   shrink_page_list
+        *     wait_on_page_writeback(A)
+        *                              SetPageWriteback(B)
+        *                              unlock_page(B)
+        *                              # flush A, B to clear the writeback
+        */
+       if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+               vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
+                                                 vmf->address);
+               if (!vmf->prealloc_pte)
+                       return VM_FAULT_OOM;
+               smp_wmb(); /* See comment in __pte_alloc() */
+       }
+
        ret = vma->vm_ops->fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
                            VM_FAULT_DONE_COW)))
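
Only the GFP_KERNEL allocation must stay out from under the page lock; installing an already-allocated table later is fine. pte_alloc_one_map() in this file consumes the preallocation under the pmd lock, along these lines (simplified, accounting elided):

        if (vmf->prealloc_pte) {
                vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
                if (likely(pmd_none(*vmf->pmd))) {
                        pmd_populate(vmf->vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
                        vmf->prealloc_pte = NULL;
                }
                spin_unlock(vmf->ptl);
        }
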
@@ -3693,16 +3766,45 @@ static int do_shared_fault(struct vm_fault *vmf)
  * but allow concurrent faults).
  * The mmap_sem may have been released depending on flags and our
  * return value.  See filemap_fault() and __lock_page_or_retry().
+ * If mmap_sem is released, vma may become invalid (for example
+ * by other thread calling munmap()).
  */
 static int do_fault(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
+       struct mm_struct *vm_mm = vma->vm_mm;
        int ret;
 
-       /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
-       if (!vma->vm_ops->fault)
-               ret = VM_FAULT_SIGBUS;
-       else if (!(vmf->flags & FAULT_FLAG_WRITE))
+       /*
+        * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
+        */
+       if (!vma->vm_ops->fault) {
+               /*
+                * If we find a migration pmd entry or a none pmd entry, which
+                * should never happen, return SIGBUS
+                */
+               if (unlikely(!pmd_present(*vmf->pmd)))
+                       ret = VM_FAULT_SIGBUS;
+               else {
+                       vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
+                                                      vmf->pmd,
+                                                      vmf->address,
+                                                      &vmf->ptl);
+                       /*
+                        * Make sure this is not a temporary clearing of pte
+                        * by holding ptl and checking again. A R/M/W update
+                        * of pte involves: take ptl, clearing the pte so that
+                        * we don't have concurrent modification by hardware
+                        * followed by an update.
+                        */
+                       if (unlikely(pte_none(*vmf->pte)))
+                               ret = VM_FAULT_SIGBUS;
+                       else
+                               ret = VM_FAULT_NOPAGE;
+
+                       pte_unmap_unlock(vmf->pte, vmf->ptl);
+               }
+       } else if (!(vmf->flags & FAULT_FLAG_WRITE))
                ret = do_read_fault(vmf);
        else if (!(vma->vm_flags & VM_SHARED))
                ret = do_cow_fault(vmf);
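
The pte_none() recheck under ptl matters because another thread can legitimately make the pte transiently none during a read-modify-write update. A generic sketch of such a writer (hypothetical, just to show the window the comment describes):

        pte_t oldpte, newpte;

        spin_lock(ptl);
        oldpte = ptep_get_and_clear(mm, addr, pte);     /* pte is briefly none */
        newpte = pte_modify(oldpte, newprot);
        set_pte_at(mm, addr, pte, newpte);
        spin_unlock(ptl);

A lockless reader that inspected *pte between the clear and the set would see pte_none(); taking ptl and checking again, as above, rules that out.
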
@@ -3711,7 +3813,7 @@ static int do_fault(struct vm_fault *vmf)
 
        /* preallocated pagetable is unused: free it */
        if (vmf->prealloc_pte) {
-               pte_free(vma->vm_mm, vmf->prealloc_pte);
+               pte_free(vm_mm, vmf->prealloc_pte);
                vmf->prealloc_pte = NULL;
        }
        return ret;
@@ -4362,6 +4464,9 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                return -EINVAL;
 
        maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
+       if (!maddr)
+               return -ENOMEM;
+
        if (write)
                memcpy_toio(maddr + offset, buf, len);
        else
@@ -4384,7 +4489,9 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
        void *old_buf = buf;
        int write = gup_flags & FOLL_WRITE;
 
-       down_read(&mm->mmap_sem);
+       if (down_read_killable(&mm->mmap_sem))
+               return 0;
+
        /* ignore errors, just check how much was successfully transferred */
        while (len) {
                int bytes, ret, offset;
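
down_read_killable() returns 0 once the semaphore is acquired and -EINTR if the sleeping task receives a fatal signal, so a killed task now bails out reporting zero bytes transferred instead of blocking uninterruptibly on mmap_sem. Its declaration, for reference:

        /* include/linux/rwsem.h */
        extern int down_read_killable(struct rw_semaphore *sem);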