/*
 * mm/userfaultfd.c
 *
 * Copyright (C) 2015 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include "internal.h"

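/*
 * Install a newly allocated anonymous page at dst_addr, filled with
 * PAGE_SIZE bytes copied from the userspace address src_addr.  On the
 * first attempt *pagep is NULL and the page is allocated and copied
 * here under mmap_sem; if the atomic copy fails, the page is handed
 * back to the caller through *pagep together with -EFAULT so the copy
 * can be retried outside mmap_sem and the pre-filled page passed back
 * in.
 */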
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
			    pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr,
			    unsigned long src_addr,
			    struct page **pagep)
{
	struct mem_cgroup *memcg;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	void *page_kaddr;
	int ret;
	struct page *page;

	if (!*pagep) {
		ret = -ENOMEM;
		page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
		if (!page)
			goto out;

		page_kaddr = kmap_atomic(page);
		ret = copy_from_user(page_kaddr,
				     (const void __user *) src_addr,
				     PAGE_SIZE);
		kunmap_atomic(page_kaddr);

		/* fallback to copy_from_user outside mmap_sem */
		if (unlikely(ret)) {
			ret = -EFAULT;
			*pagep = page;
			/* don't free the page */
			goto out;
		}
	} else {
		page = *pagep;
		*pagep = NULL;
	}

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	ret = -ENOMEM;
	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
		goto out_release;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (dst_vma->vm_flags & VM_WRITE)
		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));

	ret = -EEXIST;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!pte_none(*dst_pte))
		goto out_release_uncharge_unlock;

	inc_mm_counter(dst_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, dst_vma);

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);

	pte_unmap_unlock(dst_pte, ptl);
	ret = 0;
out:
	return ret;
out_release_uncharge_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	mem_cgroup_cancel_charge(page, memcg, false);
out_release:
	put_page(page);
	goto out;
}

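/*
 * Map the zero page at dst_addr: build a pte_special() pte pointing at
 * the architecture's zero pfn and install it, failing with -EEXIST if
 * a pte is already present.
 */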
static int mfill_zeropage_pte(struct mm_struct *dst_mm,
			      pmd_t *dst_pmd,
			      struct vm_area_struct *dst_vma,
			      unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EEXIST;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!pte_none(*dst_pte))
		goto out_unlock;
	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	return ret;
}

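/*
 * Walk (and if necessary allocate) the page table levels down to the
 * pmd covering @address.  Returns NULL if a pud could not be
 * allocated; the returned pmd may already be populated.
 */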
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, address);
	pud = pud_alloc(mm, pgd, address);
	if (pud)
		/*
		 * Note that we don't necessarily get here because the
		 * pmd was missing: *pmd may already be established,
		 * and it may even be a trans_huge_pmd.
		 */
		pmd = pmd_alloc(mm, pud, address);
	return pmd;
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine
 * is called with mmap_sem held; it will release mmap_sem before
 * returning.
 */
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;
	struct hstate *h;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge page may exist as used
	 * by THP.  Since we cannot reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (zeropage) {
		up_read(&dst_mm->mmap_sem);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on the huge page size.
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_sem
	 * and retry, dst_vma will be set to NULL and we must look it up
	 * again.
	 */
	if (!dst_vma) {
		err = -EINVAL;
		dst_vma = find_vma(dst_mm, dst_start);
		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
			goto out_unlock;

		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock;

		/*
		 * Make sure the vma is not shared and that the remaining
		 * dst range is both valid and fully within a single
		 * existing vma.
		 */
		if (dst_vma->vm_flags & VM_SHARED)
			goto out_unlock;
		if (dst_start < dst_vma->vm_start ||
		    dst_start + len > dst_vma->vm_end)
			goto out_unlock;
	}

	if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
		    (len - copied) & (vma_hpagesize - 1)))
		goto out_unlock;

	/*
	 * Only allow __mcopy_atomic_hugetlb on userfaultfd registered ranges.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma.
	 */
	err = -ENOMEM;
	if (unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;

	h = hstate_vma(dst_vma);

	while (src_addr < src_start + len) {
		pte_t dst_pteval;

		BUG_ON(dst_addr >= dst_start + len);
		VM_BUG_ON(dst_addr & ~huge_page_mask(h));

		/*
		 * Serialize via hugetlb_fault_mutex
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
						idx, dst_addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
		if (!dst_pte) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = -EEXIST;
		dst_pteval = huge_ptep_get(dst_pte);
		if (!huge_pte_none(dst_pteval)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
						dst_addr, src_addr, &page);

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		cond_resched();

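		/*
		 * -EFAULT from hugetlb_mcopy_atomic_pte means the source
		 * could not be copied atomically.  Drop mmap_sem, do the
		 * copy from userspace where it may sleep and fault, then
		 * retake mmap_sem and retry from the top so the vma is
		 * revalidated.
		 */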
		if (unlikely(err == -EFAULT)) {
			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			err = copy_huge_page_from_user(page,
						(const void __user *)src_addr,
						pages_per_huge_page(h), true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			down_read(&dst_mm->mmap_sem);

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page) {
		/*
		 * We encountered an error and are about to free a newly
		 * allocated huge page.  It is possible that there was a
		 * reservation associated with the page that has been
		 * consumed.  See the routine restore_reserve_on_error
		 * for details.  Unfortunately, we cannot call
		 * restore_reserve_on_error now as it would require
		 * holding mmap_sem.  Clear the PagePrivate flag so that
		 * the global reserve count will not be incremented in
		 * free_huge_page.  The reservation map will still
		 * indicate the reservation was consumed and possibly
		 * prevent later page allocation.  This is better than
		 * leaking a global reservation.
		 */
		ClearPagePrivate(page);
		put_page(page);
	}
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
				      struct vm_area_struct *dst_vma,
				      unsigned long dst_start,
				      unsigned long src_start,
				      unsigned long len,
				      bool zeropage);
#endif /* CONFIG_HUGETLB_PAGE */

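/*
 * Common implementation for the UFFDIO_COPY and UFFDIO_ZEROPAGE ioctls:
 * walk the destination range one page at a time and fill each missing
 * pte either with a page copied from src_addr (zeropage == false) or
 * with the zero page (zeropage == true).  Returns the number of bytes
 * filled, or a negative error if nothing was copied.
 */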
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
retry:
	down_read(&dst_mm->mmap_sem);

	/*
	 * Make sure the vma is not shared and that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	err = -EINVAL;
	dst_vma = find_vma(dst_mm, dst_start);
	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;
	if (dst_start < dst_vma->vm_start ||
	    dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to the appropriate routine.
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
					      src_start, len, zeropage);

	/*
	 * Be strict and only allow __mcopy_atomic on userfaultfd
	 * registered ranges to prevent userland errors going
	 * unnoticed.  As far as VM consistency is concerned, it
	 * would be perfectly safe to remove this check, but there's
	 * no useful usage for __mcopy_atomic outside of userfaultfd
	 * registered ranges.  This is after all why these are ioctls
	 * belonging to the userfaultfd and not syscalls.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		goto out_unlock;

	/*
	 * FIXME: only allow copying on anonymous vmas, tmpfs should
	 * be added.
	 */
	if (!vma_is_anonymous(dst_vma))
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma or this page
	 * would get a NULL anon_vma when moved into the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmd_read_atomic(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		if (!zeropage)
			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
					       dst_addr, src_addr, &page);
		else
			err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
						 dst_addr);

		cond_resched();

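		/*
		 * -EFAULT from mcopy_atomic_pte means the atomic
		 * copy_from_user() failed and *page holds the already
		 * allocated destination page.  Drop mmap_sem, do the
		 * copy with a sleepable kmap() so it can fault, then
		 * retry with the pre-filled page.
		 */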
		if (unlikely(err == -EFAULT)) {
			void *page_kaddr;

			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			page_kaddr = kmap(page);
			err = copy_from_user(page_kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap(page);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page)
		put_page(page);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}

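/*
 * Copy len bytes from src_start in the calling process into the
 * destination range starting at dst_start in dst_mm; backend for the
 * UFFDIO_COPY ioctl.
 */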
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
		     unsigned long src_start, unsigned long len)
{
	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
}

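/*
 * Fill the range starting at start in dst_mm with the zero page;
 * backend for the UFFDIO_ZEROPAGE ioctl.
 */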
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len)
{
	return __mcopy_atomic(dst_mm, start, 0, len, true);
}