userfaultfd: hugetlbfs: prevent UFFDIO_COPY to fill beyond the end of i_size

[mirror_ubuntu-artful-kernel.git] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index e158f7ac67300b10b8827fe6825667506095f550..969c5bf31997f812c0214fe4601b35d391642107 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -68,6 +68,7 @@
  #include <linux/debugfs.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/dax.h>
+#include <linux/oom.h>
  
  #include <asm/io.h>
  #include <asm/mmu_context.h>
@@ -1675,7 +1676,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
  EXPORT_SYMBOL(vm_insert_page);
  
  static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-                       pfn_t pfn, pgprot_t prot)
+                       pfn_t pfn, pgprot_t prot, bool mkwrite)
  {
         struct mm_struct *mm = vma->vm_mm;
         int retval;
@@ -1687,14 +1688,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
         if (!pte)
                 goto out;
         retval = -EBUSY;
-       if (!pte_none(*pte))
-               goto out_unlock;
+       if (!pte_none(*pte)) {
+               if (mkwrite) {
+                       /*
+                        * For read faults on private mappings the PFN passed
+                        * in may not match the PFN we have mapped if the
+                        * mapped PFN is a writeable COW page.  In the mkwrite
+                        * case we are creating a writable PTE for a shared
+                        * mapping and we expect the PFNs to match.
+                        */
+                       if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+                               goto out_unlock;
+                       entry = *pte;
+                       goto out_mkwrite;
+               } else
+                       goto out_unlock;
+       }
  
         /* Ok, finally just insert the thing.. */
         if (pfn_t_devmap(pfn))
                 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
         else
                 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+
+out_mkwrite:
+       if (mkwrite) {
+               entry = pte_mkyoung(entry);
+               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+       }
+
         set_pte_at(mm, addr, pte, entry);
         update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
  
@@ -1765,14 +1787,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
  
         track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
  
-       ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
+       ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+                       false);
  
         return ret;
  }
  EXPORT_SYMBOL(vm_insert_pfn_prot);
  
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-                       pfn_t pfn)
+static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+                       pfn_t pfn, bool mkwrite)
  {
         pgprot_t pgprot = vma->vm_page_prot;
  
@@ -1801,10 +1824,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                 page = pfn_to_page(pfn_t_to_pfn(pfn));
                 return insert_page(vma, addr, page, pgprot);
         }
-       return insert_pfn(vma, addr, pfn, pgprot);
+       return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+}
+
+/*
+ * vm_insert_mixed - insert @pfn at @addr in @vma without mkwrite handling
+ *
+ * Thin wrapper: forwards to __vm_insert_mixed() with mkwrite == false and
+ * returns its result unchanged.  With mkwrite == false, insert_pfn() does
+ * not touch a PTE that is already populated (it bails out with retval set
+ * to -EBUSY) and does not apply the young/dirty/maybe_mkwrite()
+ * adjustments to the new entry.
+ */
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+                       pfn_t pfn)
+{
+       return __vm_insert_mixed(vma, addr, pfn, false);
+
 }
  EXPORT_SYMBOL(vm_insert_mixed);
  
+/*
+ * vm_insert_mixed_mkwrite - like vm_insert_mixed(), but with mkwrite == true
+ *
+ * Forwards to __vm_insert_mixed() with mkwrite == true and returns its
+ * result unchanged.  On the insert_pfn() path this means:
+ *  - an already-populated PTE is reused rather than rejected, provided it
+ *    maps the same PFN (a mismatch triggers WARN_ON_ONCE and bails out);
+ *  - the entry is marked young and dirty and passed through
+ *    maybe_mkwrite() before being installed with set_pte_at().
+ * The insert_page() path in __vm_insert_mixed() is not passed mkwrite.
+ */
+int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
+                       pfn_t pfn)
+{
+       return __vm_insert_mixed(vma, addr, pfn, true);
+}
+EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
+
  /*
   * maps a range of physical memory into the requested pages. the old
   * mappings are removed. any references to nonexistent pages results
@@ -2893,6 +2930,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
         struct vm_area_struct *vma = vmf->vma;
         struct mem_cgroup *memcg;
         struct page *page;
+       int ret = 0;
         pte_t entry;
  
         /* File mapping without ->vm_ops ? */
@@ -2925,6 +2963,9 @@ static int do_anonymous_page(struct vm_fault *vmf)
                                 vmf->address, &vmf->ptl);
                 if (!pte_none(*vmf->pte))
                         goto unlock;
+               ret = check_stable_address_space(vma->vm_mm);
+               if (ret)
+                       goto unlock;
                 /* Deliver the page fault to userland, check inside PT lock */
                 if (userfaultfd_missing(vma)) {
                         pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2959,6 +3000,10 @@ static int do_anonymous_page(struct vm_fault *vmf)
         if (!pte_none(*vmf->pte))
                 goto release;
  
+       ret = check_stable_address_space(vma->vm_mm);
+       if (ret)
+               goto release;
+
         /* Deliver the page fault to userland, check inside PT lock */
         if (userfaultfd_missing(vma)) {
                 pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2978,7 +3023,7 @@ setpte:
         update_mmu_cache(vma, vmf->address, vmf->pte);
  unlock:
         pte_unmap_unlock(vmf->pte, vmf->ptl);
-       return 0;
+       return ret;
  release:
         mem_cgroup_cancel_charge(page, memcg, false);
         put_page(page);
@@ -3252,7 +3297,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
  int finish_fault(struct vm_fault *vmf)
  {
         struct page *page;
-       int ret;
+       int ret = 0;
  
         /* Did we COW the page? */
         if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3260,7 +3305,15 @@ int finish_fault(struct vm_fault *vmf)
                 page = vmf->cow_page;
         else
                 page = vmf->page;
-       ret = alloc_set_pte(vmf, vmf->memcg, page);
+
+       /*
+        * check even for read faults because we might have lost our CoWed
+        * page
+        */
+       if (!(vmf->vma->vm_flags & VM_SHARED))
+               ret = check_stable_address_space(vmf->vma->vm_mm);
+       if (!ret)
+               ret = alloc_set_pte(vmf, vmf->memcg, page);
         if (vmf->pte)
                 pte_unmap_unlock(vmf->pte, vmf->ptl);
         return ret;
@@ -3871,6 +3924,11 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
         /* do counter updates before entering really critical section. */
         check_sync_rss_stat(current);
  
+       if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+                                           flags & FAULT_FLAG_INSTRUCTION,
+                                           flags & FAULT_FLAG_REMOTE))
+               return VM_FAULT_SIGSEGV;
+
         /*
          * Enable the memcg OOM handling for faults triggered in user
          * space.  Kernel faults are handled more gracefully.
@@ -3878,11 +3936,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
         if (flags & FAULT_FLAG_USER)
                 mem_cgroup_oom_enable();
  
-       if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
-                                           flags & FAULT_FLAG_INSTRUCTION,
-                                           flags & FAULT_FLAG_REMOTE))
-               return VM_FAULT_SIGSEGV;
-
         if (unlikely(is_vm_hugetlb_page(vma)))
                 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
         else
@@ -3900,19 +3953,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                         mem_cgroup_oom_synchronize(false);
         }
  
-       /*
-        * This mm has been already reaped by the oom reaper and so the
-        * refault cannot be trusted in general. Anonymous refaults would
-        * lose data and give a zero page instead e.g. This is especially
-        * problem for use_mm() because regular tasks will just die and
-        * the corrupted data will not be visible anywhere while kthread
-        * will outlive the oom victim and potentially propagate the date
-        * further.
-        */
-       if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
-                               && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
-               ret = VM_FAULT_SIGBUS;
-
         return ret;
  }
  EXPORT_SYMBOL_GPL(handle_mm_fault);
@@ -4004,7 +4044,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
  #endif /* __PAGETABLE_PMD_FOLDED */
  
  static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
-               pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+                           unsigned long *start, unsigned long *end,
+                           pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
  {
         pgd_t *pgd;
         p4d_t *p4d;
@@ -4031,17 +4072,29 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
                 if (!pmdpp)
                         goto out;
  
+               if (start && end) {
+                       *start = address & PMD_MASK;
+                       *end = *start + PMD_SIZE;
+                       mmu_notifier_invalidate_range_start(mm, *start, *end);
+               }
                 *ptlp = pmd_lock(mm, pmd);
                 if (pmd_huge(*pmd)) {
                         *pmdpp = pmd;
                         return 0;
                 }
                 spin_unlock(*ptlp);
+               if (start && end)
+                       mmu_notifier_invalidate_range_end(mm, *start, *end);
         }
  
         if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
                 goto out;
  
+       if (start && end) {
+               *start = address & PAGE_MASK;
+               *end = *start + PAGE_SIZE;
+               mmu_notifier_invalidate_range_start(mm, *start, *end);
+       }
         ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
         if (!pte_present(*ptep))
                 goto unlock;
@@ -4049,6 +4102,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
         return 0;
  unlock:
         pte_unmap_unlock(ptep, *ptlp);
+       if (start && end)
+               mmu_notifier_invalidate_range_end(mm, *start, *end);
  out:
         return -EINVAL;
  }
@@ -4060,20 +4115,21 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
  
         /* (void) is needed to make gcc happy */
         (void) __cond_lock(*ptlp,
-                          !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
-                                          ptlp)));
+                          !(res = __follow_pte_pmd(mm, address, NULL, NULL,
+                                                   ptepp, NULL, ptlp)));
         return res;
  }
  
  int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+                            unsigned long *start, unsigned long *end,
                              pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
  {
         int res;
  
         /* (void) is needed to make gcc happy */
         (void) __cond_lock(*ptlp,
-                          !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
-                                          ptlp)));
+                          !(res = __follow_pte_pmd(mm, address, start, end,
+                                                   ptepp, pmdpp, ptlp)));
         return res;
  }
  EXPORT_SYMBOL(follow_pte_pmd);