diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index af817e5060fbfbda2be8ba35024c1ad460055b20..479e631d43c2f609466b1dee97ca2d0314ae12ba 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -8,12 +8,16 @@
  */
 
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/mmu_notifier.h>
+#include <linux/hugetlb.h>
+#include <linux/shmem_fs.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
 
@@ -139,6 +143,231 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
        return pmd;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
+ * called with mmap_sem held; it will release mmap_sem before returning.
+ */
+static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+                                             struct vm_area_struct *dst_vma,
+                                             unsigned long dst_start,
+                                             unsigned long src_start,
+                                             unsigned long len,
+                                             bool zeropage)
+{
+       int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
+       int vm_shared = dst_vma->vm_flags & VM_SHARED;
+       ssize_t err;
+       pte_t *dst_pte;
+       unsigned long src_addr, dst_addr;
+       long copied;
+       struct page *page;
+       struct hstate *h;
+       unsigned long vma_hpagesize;
+       pgoff_t idx;
+       u32 hash;
+       struct address_space *mapping;
+
+       /*
+        * There is no default zero huge page for all of the huge page
+        * sizes supported by hugetlb.  A PMD_SIZE huge page may exist as
+        * used by THP.  Since we cannot reliably insert a zero page, this
+        * feature is not supported.
+        */
+       if (zeropage) {
+               up_read(&dst_mm->mmap_sem);
+               return -EINVAL;
+       }
+
+       src_addr = src_start;
+       dst_addr = dst_start;
+       copied = 0;
+       page = NULL;
+       vma_hpagesize = vma_kernel_pagesize(dst_vma);
+
+       /*
+        * Validate alignment based on huge page size
+        */
+       err = -EINVAL;
+       if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
+               goto out_unlock;
+
+retry:
+       /*
+        * On routine entry dst_vma is set.  If we had to drop mmap_sem and
+        * retry, dst_vma will be set to NULL and we must look it up again.
+        */
+       if (!dst_vma) {
+               err = -ENOENT;
+               dst_vma = find_vma(dst_mm, dst_start);
+               if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
+                       goto out_unlock;
+               /*
+                * Only allow __mcopy_atomic_hugetlb on userfaultfd
+                * registered ranges.
+                */
+               if (!dst_vma->vm_userfaultfd_ctx.ctx)
+                       goto out_unlock;
+
+               if (dst_start < dst_vma->vm_start ||
+                   dst_start + len > dst_vma->vm_end)
+                       goto out_unlock;
+
+               err = -EINVAL;
+               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
+                       goto out_unlock;
+
+               vm_shared = dst_vma->vm_flags & VM_SHARED;
+       }
+
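+       /*
+        * After a retry, the address and remaining length must still be
+        * huge page aligned; anything else indicates corrupted state.
+        */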
+       if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
+                   (len - copied) & (vma_hpagesize - 1)))
+               goto out_unlock;
+
+       /*
+        * If not shared, ensure the dst_vma has an anon_vma.
+        */
+       err = -ENOMEM;
+       if (!vm_shared) {
+               if (unlikely(anon_vma_prepare(dst_vma)))
+                       goto out_unlock;
+       }
+
+       h = hstate_vma(dst_vma);
+
+       while (src_addr < src_start + len) {
+               pte_t dst_pteval;
+
+               BUG_ON(dst_addr >= dst_start + len);
+               VM_BUG_ON(dst_addr & ~huge_page_mask(h));
+
+               /*
+                * Serialize via hugetlb_fault_mutex to avoid racing with
+                * concurrent hugetlb faults on the same page index.
+                */
+               idx = linear_page_index(dst_vma, dst_addr);
+               mapping = dst_vma->vm_file->f_mapping;
+               hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
+                                                               idx, dst_addr);
+               mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+               err = -ENOMEM;
+               dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
+               if (!dst_pte) {
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       goto out_unlock;
+               }
+
+               err = -EEXIST;
+               dst_pteval = huge_ptep_get(dst_pte);
+               if (!huge_pte_none(dst_pteval)) {
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       goto out_unlock;
+               }
+
+               err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
+                                               dst_addr, src_addr, &page);
+
+               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
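+               /*
+                * Remember whether the vma the page was allocated for is
+                * shared; dst_vma may change or become NULL on retry, and
+                * the error path below relies on the original value.
+                */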
+               vm_alloc_shared = vm_shared;
+
+               cond_resched();
+
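+               /*
+                * -EFAULT means the copy from userspace faulted while page
+                * faults were disabled.  Drop mmap_sem, retry the copy with
+                * page faults enabled, then reacquire mmap_sem and redo the
+                * vma lookup.
+                */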
+               if (unlikely(err == -EFAULT)) {
+                       up_read(&dst_mm->mmap_sem);
+                       BUG_ON(!page);
+
+                       err = copy_huge_page_from_user(page,
+                                               (const void __user *)src_addr,
+                                               pages_per_huge_page(h), true);
+                       if (unlikely(err)) {
+                               err = -EFAULT;
+                               goto out;
+                       }
+                       down_read(&dst_mm->mmap_sem);
+
+                       dst_vma = NULL;
+                       goto retry;
+               } else {
+                       BUG_ON(page);
+               }
+
+               if (!err) {
+                       dst_addr += vma_hpagesize;
+                       src_addr += vma_hpagesize;
+                       copied += vma_hpagesize;
+
+                       if (fatal_signal_pending(current))
+                               err = -EINTR;
+               }
+               if (err)
+                       break;
+       }
+
+out_unlock:
+       up_read(&dst_mm->mmap_sem);
+out:
+       if (page) {
+               /*
+                * We encountered an error and are about to free a newly
+                * allocated huge page.
+                *
+                * Reservation handling is very subtle, and is different for
+                * private and shared mappings.  See the routine
+                * restore_reserve_on_error for details.  Unfortunately, we
+                * can not call restore_reserve_on_error now as it would
+                * require holding mmap_sem.
+                *
+                * If a reservation for the page existed in the reservation
+                * map of a private mapping, the map was modified to indicate
+                * the reservation was consumed when the page was allocated.
+                * We clear the PagePrivate flag now so that the global
+                * reserve count will not be incremented in free_huge_page.
+                * The reservation map will still indicate the reservation
+                * was consumed and possibly prevent later page allocation.
+                * This is better than leaking a global reservation.  If no
+                * reservation existed, it is still safe to clear PagePrivate
+                * as no adjustments to reservation counts were made during
+                * allocation.
+                *
+                * The reservation map for shared mappings indicates which
+                * pages have reservations.  When a huge page is allocated
+                * for an address with a reservation, no change is made to
+                * the reserve map.  In this case PagePrivate will be set
+                * to indicate that the global reservation count should be
+                * incremented when the page is freed.  This is the desired
+                * behavior.  However, when a huge page is allocated for an
+                * address without a reservation a reservation entry is added
+                * to the reservation map, and PagePrivate will not be set.
+                * When the page is freed, the global reserve count will NOT
+                * be incremented and it will appear as though we have leaked
+                * a reserved page.  In this case, set PagePrivate so that the
+                * global reserve count will be incremented to match the
+                * reservation map entry which was created.
+                *
+                * Note that vm_alloc_shared is based on the flags of the vma
+                * for which the page was originally allocated.  dst_vma could
+                * be different or NULL on error.
+                */
+               if (vm_alloc_shared)
+                       SetPagePrivate(page);
+               else
+                       ClearPagePrivate(page);
+               put_page(page);
+       }
+       BUG_ON(copied < 0);
+       BUG_ON(err > 0);
+       BUG_ON(!copied && !err);
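+       /* Return bytes copied if any progress was made, else the error. */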
+       return copied ? copied : err;
+}
+#else /* !CONFIG_HUGETLB_PAGE */
+/* fail at build time if gcc attempts to use this */
+extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+                                     struct vm_area_struct *dst_vma,
+                                     unsigned long dst_start,
+                                     unsigned long src_start,
+                                     unsigned long len,
+                                     bool zeropage);
+#endif /* CONFIG_HUGETLB_PAGE */
+
 static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
                                              unsigned long dst_start,
                                              unsigned long src_start,
@@ -173,14 +402,10 @@ retry:
-        * Make sure the vma is not shared, that the dst range is
-        * both valid and fully within a single existing vma.
+        * Make sure the dst range is both valid and fully within a
+        * single existing vma.
         */
-       err = -EINVAL;
+       err = -ENOENT;
        dst_vma = find_vma(dst_mm, dst_start);
-       if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
-               goto out_unlock;
-       if (dst_start < dst_vma->vm_start ||
-           dst_start + len > dst_vma->vm_end)
+       if (!dst_vma)
                goto out_unlock;
-
        /*
         * Be strict and only allow __mcopy_atomic on userfaultfd
         * registered ranges to prevent userland errors going
@@ -193,11 +418,27 @@ retry:
        if (!dst_vma->vm_userfaultfd_ctx.ctx)
                goto out_unlock;
 
+       if (dst_start < dst_vma->vm_start ||
+           dst_start + len > dst_vma->vm_end)
+               goto out_unlock;
+
+       err = -EINVAL;
+       /*
+        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
+        * it will overwrite vm_ops, so vma_is_anonymous must return false.
+        */
+       if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
+           dst_vma->vm_flags & VM_SHARED))
+               goto out_unlock;
+
        /*
-        * FIXME: only allow copying on anonymous vmas, tmpfs should
-        * be added.
+        * If this is a HUGETLB vma, pass off to the hugetlb routine,
+        * which releases mmap_sem before returning.
         */
-       if (dst_vma->vm_ops)
+       if (is_vm_hugetlb_page(dst_vma))
+               return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+                                             src_start, len, zeropage);
+
+       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
                goto out_unlock;
 
        /*
@@ -206,7 +447,7 @@ retry:
         * dst_vma.
         */
        err = -ENOMEM;
-       if (unlikely(anon_vma_prepare(dst_vma)))
+       if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma)))
                goto out_unlock;
 
        while (src_addr < src_start + len) {
@@ -243,12 +484,21 @@ retry:
                BUG_ON(pmd_none(*dst_pmd));
                BUG_ON(pmd_trans_huge(*dst_pmd));
 
-               if (!zeropage)
-                       err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
-                                              dst_addr, src_addr, &page);
-               else
-                       err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
-                                                dst_addr);
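+               /*
+                * Anonymous vmas keep the existing copy/zeropage helpers;
+                * shmem backed vmas go through shmem_mcopy_atomic_pte,
+                * which has no zeropage support.
+                */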
+               if (vma_is_anonymous(dst_vma)) {
+                       if (!zeropage)
+                               err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+                                                      dst_addr, src_addr,
+                                                      &page);
+                       else
+                               err = mfill_zeropage_pte(dst_mm, dst_pmd,
+                                                        dst_vma, dst_addr);
+               } else {
+                       err = -EINVAL; /* zeropage is not supported on shmem */
+                       if (likely(!zeropage))
+                               err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
+                                                            dst_vma, dst_addr,
+                                                            src_addr, &page);
+               }
 
                cond_resched();