sched/headers: Prepare to move signal wakeup & sigpending methods from <linux/sched...
[mirror_ubuntu-artful-kernel.git] / mm / userfaultfd.c
index 09976745be2307a97945449584a2696983bc75eb..479e631d43c2f609466b1dee97ca2d0314ae12ba 100644
@@ -8,6 +8,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
@@ -16,6 +17,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
+#include <linux/shmem_fs.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
 
@@ -153,6 +155,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
                                              unsigned long len,
                                              bool zeropage)
 {
+       int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
+       int vm_shared = dst_vma->vm_flags & VM_SHARED;
        ssize_t err;
        pte_t *dst_pte;
        unsigned long src_addr, dst_addr;
@@ -194,23 +198,26 @@ retry:
         * retry, dst_vma will be set to NULL and we must lookup again.
         */
        if (!dst_vma) {
-               err = -EINVAL;
+               err = -ENOENT;
                dst_vma = find_vma(dst_mm, dst_start);
                if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
                        goto out_unlock;
-
-               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
-                       goto out_unlock;
-
                /*
-                * Make sure the vma is not shared, that the remaining dst
-                * range is both valid and fully within a single existing vma.
+                * Only allow __mcopy_atomic_hugetlb on userfaultfd
+                * registered ranges.
                 */
-               if (dst_vma->vm_flags & VM_SHARED)
+               if (!dst_vma->vm_userfaultfd_ctx.ctx)
                        goto out_unlock;
+
                if (dst_start < dst_vma->vm_start ||
                    dst_start + len > dst_vma->vm_end)
                        goto out_unlock;
+
+               err = -EINVAL;
+               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
+                       goto out_unlock;
+
+               vm_shared = dst_vma->vm_flags & VM_SHARED;
        }
 
        if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
@@ -218,17 +225,13 @@ retry:
                goto out_unlock;
 
        /*
-        * Only allow __mcopy_atomic_hugetlb on userfaultfd registered ranges.
-        */
-       if (!dst_vma->vm_userfaultfd_ctx.ctx)
-               goto out_unlock;
-
-       /*
-        * Ensure the dst_vma has a anon_vma.
+        * If not shared, ensure the dst_vma has an anon_vma.
         */
        err = -ENOMEM;
-       if (unlikely(anon_vma_prepare(dst_vma)))
-               goto out_unlock;
+       if (!vm_shared) {
+               if (unlikely(anon_vma_prepare(dst_vma)))
+                       goto out_unlock;
+       }
 
        h = hstate_vma(dst_vma);
 
@@ -265,6 +268,7 @@ retry:
                                                dst_addr, src_addr, &page);
 
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+               vm_alloc_shared = vm_shared;
 
                cond_resched();
 
@@ -301,8 +305,54 @@ retry:
 out_unlock:
        up_read(&dst_mm->mmap_sem);
 out:
-       if (page)
+       if (page) {
+               /*
+                * We encountered an error and are about to free a newly
+                * allocated huge page.
+                *
+                * Reservation handling is very subtle, and is different for
+                * private and shared mappings.  See the routine
+                * restore_reserve_on_error for details.  Unfortunately, we
+                * can not call restore_reserve_on_error now as it would
+                * require holding mmap_sem.
+                *
+                * If a reservation for the page existed in the reservation
+                * map of a private mapping, the map was modified to indicate
+                * the reservation was consumed when the page was allocated.
+                * We clear the PagePrivate flag now so that the global
+                * reserve count will not be incremented in free_huge_page.
+                * The reservation map will still indicate the reservation
+                * was consumed and possibly prevent later page allocation.
+                * This is better than leaking a global reservation.  If no
+                * reservation existed, it is still safe to clear PagePrivate
+                * as no adjustments to reservation counts were made during
+                * allocation.
+                *
+                * The reservation map for shared mappings indicates which
+                * pages have reservations.  When a huge page is allocated
+                * for an address with a reservation, no change is made to
+                * the reserve map.  In this case PagePrivate will be set
+                * to indicate that the global reservation count should be
+                * incremented when the page is freed.  This is the desired
+                * behavior.  However, when a huge page is allocated for an
+                * address without a reservation a reservation entry is added
+                * to the reservation map, and PagePrivate will not be set.
+                * When the page is freed, the global reserve count will NOT
+                * be incremented and it will appear as though we have leaked a
+                * reserved page.  In this case, set PagePrivate so that the
+                * global reserve count will be incremented to match the
+                * reservation map entry which was created.
+                *
+                * Note that vm_alloc_shared is based on the flags of the vma
+                * for which the page was originally allocated.  dst_vma could
+                * be different or NULL on error.
+                */
+               if (vm_alloc_shared)
+                       SetPagePrivate(page);
+               else
+                       ClearPagePrivate(page);
                put_page(page);
+       }
        BUG_ON(copied < 0);
        BUG_ON(err > 0);
        BUG_ON(!copied && !err);
@@ -352,21 +402,10 @@ retry:
         * Make sure the vma is not shared, that the dst range is
         * both valid and fully within a single existing vma.
         */
-       err = -EINVAL;
+       err = -ENOENT;
        dst_vma = find_vma(dst_mm, dst_start);
-       if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
-               goto out_unlock;
-       if (dst_start < dst_vma->vm_start ||
-           dst_start + len > dst_vma->vm_end)
+       if (!dst_vma)
                goto out_unlock;
-
-       /*
-        * If this is a HUGETLB vma, pass off to appropriate routine
-        */
-       if (is_vm_hugetlb_page(dst_vma))
-               return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
-                                               src_start, len, zeropage);
-
        /*
         * Be strict and only allow __mcopy_atomic on userfaultfd
         * registered ranges to prevent userland errors going
@@ -379,11 +418,27 @@ retry:
        if (!dst_vma->vm_userfaultfd_ctx.ctx)
                goto out_unlock;
 
+       if (dst_start < dst_vma->vm_start ||
+           dst_start + len > dst_vma->vm_end)
+               goto out_unlock;
+
+       err = -EINVAL;
        /*
-        * FIXME: only allow copying on anonymous vmas, tmpfs should
-        * be added.
+        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
+        * it will overwrite vm_ops, so vma_is_anonymous must return false.
         */
-       if (!vma_is_anonymous(dst_vma))
+       if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
+           dst_vma->vm_flags & VM_SHARED))
+               goto out_unlock;
+
+       /*
+        * If this is a HUGETLB vma, pass off to appropriate routine
+        */
+       if (is_vm_hugetlb_page(dst_vma))
+               return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+                                               src_start, len, zeropage);
+
+       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
                goto out_unlock;
 
        /*
@@ -392,7 +447,7 @@ retry:
         * dst_vma.
         */
        err = -ENOMEM;
-       if (unlikely(anon_vma_prepare(dst_vma)))
+       if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma)))
                goto out_unlock;
 
        while (src_addr < src_start + len) {
@@ -429,12 +484,21 @@ retry:
                BUG_ON(pmd_none(*dst_pmd));
                BUG_ON(pmd_trans_huge(*dst_pmd));
 
-               if (!zeropage)
-                       err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
-                                              dst_addr, src_addr, &page);
-               else
-                       err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
-                                                dst_addr);
+               if (vma_is_anonymous(dst_vma)) {
+                       if (!zeropage)
+                               err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+                                                      dst_addr, src_addr,
+                                                      &page);
+                       else
+                               err = mfill_zeropage_pte(dst_mm, dst_pmd,
+                                                        dst_vma, dst_addr);
+               } else {
+                       err = -EINVAL; /* if zeropage is true return -EINVAL */
+                       if (likely(!zeropage))
+                               err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
+                                                            dst_vma, dst_addr,
+                                                            src_addr, &page);
+               }
 
                cond_resched();
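
The hunks above extend UFFDIO_COPY to VM_SHARED hugetlbfs mappings and to shmem (tmpfs) mappings: a non-anonymous dst_vma is now dispatched to shmem_mcopy_atomic_pte(), and __mcopy_atomic_hugetlb() handles the shared case. Below is a minimal, hedged userspace sketch of how that path is exercised on a MAP_SHARED shmem page. It is illustrative only: error checking, feature negotiation (e.g. UFFD_FEATURE_MISSING_SHMEM) and a fault-reading thread are omitted, and the memfd name and the proactive (pre-fault) UFFDIO_COPY are assumptions of the sketch, not part of the patch.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);

	/* MAP_SHARED shmem backing: the case the shmem_mcopy_atomic_pte hunk handles */
	int shm = memfd_create("uffd-shmem-demo", 0);
	ftruncate(shm, page);
	char *dst = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_SHARED, shm, 0);

	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API };
	ioctl(uffd, UFFDIO_API, &api);

	/* Register the range for missing-page tracking */
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)dst, .len = page },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/* Source page whose contents will be installed atomically */
	char *src = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 0xaa, page);

	/*
	 * UFFDIO_COPY populates the still-missing dst page; with this patch the
	 * non-anonymous vma is passed to shmem_mcopy_atomic_pte().  Note that
	 * UFFDIO_ZEROPAGE on the same vma would return -EINVAL per the hunk above.
	 */
	struct uffdio_copy copy = {
		.dst = (unsigned long)dst,
		.src = (unsigned long)src,
		.len = page,
	};
	ioctl(uffd, UFFDIO_COPY, &copy);

	printf("dst[0] after UFFDIO_COPY = 0x%02x\n", (unsigned char)dst[0]);
	return 0;
}

The sketch issues UFFDIO_COPY proactively on a page that has not faulted yet; a real handler would normally read fault events from the userfaultfd descriptor and respond per address. A hugetlbfs variant would mmap a file from a hugetlbfs mount with MAP_SHARED instead of the memfd, taking the __mcopy_atomic_hugetlb() path with vm_shared set.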