]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - mm/userfaultfd.c
userfaultfd: hugetlbfs: add __mcopy_atomic_hugetlb for huge page UFFDIO_COPY
[mirror_ubuntu-artful-kernel.git] / mm / userfaultfd.c
CommitLineData
c1a4de99
AA
1/*
2 * mm/userfaultfd.c
3 *
4 * Copyright (C) 2015 Red Hat, Inc.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
8 */
9
10#include <linux/mm.h>
11#include <linux/pagemap.h>
12#include <linux/rmap.h>
13#include <linux/swap.h>
14#include <linux/swapops.h>
15#include <linux/userfaultfd_k.h>
16#include <linux/mmu_notifier.h>
60d4d2d2
MK
17#include <linux/hugetlb.h>
18#include <linux/pagemap.h>
c1a4de99
AA
19#include <asm/tlbflush.h>
20#include "internal.h"
21
22static int mcopy_atomic_pte(struct mm_struct *dst_mm,
23 pmd_t *dst_pmd,
24 struct vm_area_struct *dst_vma,
25 unsigned long dst_addr,
b6ebaedb
AA
26 unsigned long src_addr,
27 struct page **pagep)
c1a4de99
AA
28{
29 struct mem_cgroup *memcg;
30 pte_t _dst_pte, *dst_pte;
31 spinlock_t *ptl;
c1a4de99
AA
32 void *page_kaddr;
33 int ret;
b6ebaedb 34 struct page *page;
c1a4de99 35
b6ebaedb
AA
36 if (!*pagep) {
37 ret = -ENOMEM;
38 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
39 if (!page)
40 goto out;
41
42 page_kaddr = kmap_atomic(page);
43 ret = copy_from_user(page_kaddr,
44 (const void __user *) src_addr,
45 PAGE_SIZE);
46 kunmap_atomic(page_kaddr);
47
48 /* fallback to copy_from_user outside mmap_sem */
49 if (unlikely(ret)) {
50 ret = -EFAULT;
51 *pagep = page;
52 /* don't free the page */
53 goto out;
54 }
55 } else {
56 page = *pagep;
57 *pagep = NULL;
58 }
c1a4de99
AA
59
60 /*
61 * The memory barrier inside __SetPageUptodate makes sure that
62 * preceeding stores to the page contents become visible before
63 * the set_pte_at() write.
64 */
65 __SetPageUptodate(page);
66
67 ret = -ENOMEM;
f627c2f5 68 if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
c1a4de99
AA
69 goto out_release;
70
71 _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
72 if (dst_vma->vm_flags & VM_WRITE)
73 _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
74
75 ret = -EEXIST;
76 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
77 if (!pte_none(*dst_pte))
78 goto out_release_uncharge_unlock;
79
80 inc_mm_counter(dst_mm, MM_ANONPAGES);
d281ee61 81 page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
f627c2f5 82 mem_cgroup_commit_charge(page, memcg, false, false);
c1a4de99
AA
83 lru_cache_add_active_or_unevictable(page, dst_vma);
84
85 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
86
87 /* No need to invalidate - it was non-present before */
88 update_mmu_cache(dst_vma, dst_addr, dst_pte);
89
90 pte_unmap_unlock(dst_pte, ptl);
91 ret = 0;
92out:
93 return ret;
94out_release_uncharge_unlock:
95 pte_unmap_unlock(dst_pte, ptl);
f627c2f5 96 mem_cgroup_cancel_charge(page, memcg, false);
c1a4de99 97out_release:
09cbfeaf 98 put_page(page);
c1a4de99 99 goto out;
c1a4de99
AA
100}
101
102static int mfill_zeropage_pte(struct mm_struct *dst_mm,
103 pmd_t *dst_pmd,
104 struct vm_area_struct *dst_vma,
105 unsigned long dst_addr)
106{
107 pte_t _dst_pte, *dst_pte;
108 spinlock_t *ptl;
109 int ret;
110
111 _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
112 dst_vma->vm_page_prot));
113 ret = -EEXIST;
114 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
115 if (!pte_none(*dst_pte))
116 goto out_unlock;
117 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
118 /* No need to invalidate - it was non-present before */
119 update_mmu_cache(dst_vma, dst_addr, dst_pte);
120 ret = 0;
121out_unlock:
122 pte_unmap_unlock(dst_pte, ptl);
123 return ret;
124}
125
126static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
127{
128 pgd_t *pgd;
129 pud_t *pud;
130 pmd_t *pmd = NULL;
131
132 pgd = pgd_offset(mm, address);
133 pud = pud_alloc(mm, pgd, address);
134 if (pud)
135 /*
136 * Note that we didn't run this because the pmd was
137 * missing, the *pmd may be already established and in
138 * turn it may also be a trans_huge_pmd.
139 */
140 pmd = pmd_alloc(mm, pud, address);
141 return pmd;
142}
143
60d4d2d2
MK
144#ifdef CONFIG_HUGETLB_PAGE
145/*
146 * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
147 * called with mmap_sem held, it will release mmap_sem before returning.
148 */
149static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
150 struct vm_area_struct *dst_vma,
151 unsigned long dst_start,
152 unsigned long src_start,
153 unsigned long len,
154 bool zeropage)
155{
156 ssize_t err;
157 pte_t *dst_pte;
158 unsigned long src_addr, dst_addr;
159 long copied;
160 struct page *page;
161 struct hstate *h;
162 unsigned long vma_hpagesize;
163 pgoff_t idx;
164 u32 hash;
165 struct address_space *mapping;
166
167 /*
168 * There is no default zero huge page for all huge page sizes as
169 * supported by hugetlb. A PMD_SIZE huge pages may exist as used
170 * by THP. Since we can not reliably insert a zero page, this
171 * feature is not supported.
172 */
173 if (zeropage) {
174 up_read(&dst_mm->mmap_sem);
175 return -EINVAL;
176 }
177
178 src_addr = src_start;
179 dst_addr = dst_start;
180 copied = 0;
181 page = NULL;
182 vma_hpagesize = vma_kernel_pagesize(dst_vma);
183
184 /*
185 * Validate alignment based on huge page size
186 */
187 err = -EINVAL;
188 if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
189 goto out_unlock;
190
191retry:
192 /*
193 * On routine entry dst_vma is set. If we had to drop mmap_sem and
194 * retry, dst_vma will be set to NULL and we must lookup again.
195 */
196 if (!dst_vma) {
197 err = -EINVAL;
198 dst_vma = find_vma(dst_mm, dst_start);
199 if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
200 goto out_unlock;
201
202 if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
203 goto out_unlock;
204
205 /*
206 * Make sure the vma is not shared, that the remaining dst
207 * range is both valid and fully within a single existing vma.
208 */
209 if (dst_vma->vm_flags & VM_SHARED)
210 goto out_unlock;
211 if (dst_start < dst_vma->vm_start ||
212 dst_start + len > dst_vma->vm_end)
213 goto out_unlock;
214 }
215
216 if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
217 (len - copied) & (vma_hpagesize - 1)))
218 goto out_unlock;
219
220 /*
221 * Only allow __mcopy_atomic_hugetlb on userfaultfd registered ranges.
222 */
223 if (!dst_vma->vm_userfaultfd_ctx.ctx)
224 goto out_unlock;
225
226 /*
227 * Ensure the dst_vma has a anon_vma.
228 */
229 err = -ENOMEM;
230 if (unlikely(anon_vma_prepare(dst_vma)))
231 goto out_unlock;
232
233 h = hstate_vma(dst_vma);
234
235 while (src_addr < src_start + len) {
236 pte_t dst_pteval;
237
238 BUG_ON(dst_addr >= dst_start + len);
239 VM_BUG_ON(dst_addr & ~huge_page_mask(h));
240
241 /*
242 * Serialize via hugetlb_fault_mutex
243 */
244 idx = linear_page_index(dst_vma, dst_addr);
245 mapping = dst_vma->vm_file->f_mapping;
246 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
247 idx, dst_addr);
248 mutex_lock(&hugetlb_fault_mutex_table[hash]);
249
250 err = -ENOMEM;
251 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
252 if (!dst_pte) {
253 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
254 goto out_unlock;
255 }
256
257 err = -EEXIST;
258 dst_pteval = huge_ptep_get(dst_pte);
259 if (!huge_pte_none(dst_pteval)) {
260 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
261 goto out_unlock;
262 }
263
264 err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
265 dst_addr, src_addr, &page);
266
267 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
268
269 cond_resched();
270
271 if (unlikely(err == -EFAULT)) {
272 up_read(&dst_mm->mmap_sem);
273 BUG_ON(!page);
274
275 err = copy_huge_page_from_user(page,
276 (const void __user *)src_addr,
277 pages_per_huge_page(h));
278 if (unlikely(err)) {
279 err = -EFAULT;
280 goto out;
281 }
282 down_read(&dst_mm->mmap_sem);
283
284 dst_vma = NULL;
285 goto retry;
286 } else
287 BUG_ON(page);
288
289 if (!err) {
290 dst_addr += vma_hpagesize;
291 src_addr += vma_hpagesize;
292 copied += vma_hpagesize;
293
294 if (fatal_signal_pending(current))
295 err = -EINTR;
296 }
297 if (err)
298 break;
299 }
300
301out_unlock:
302 up_read(&dst_mm->mmap_sem);
303out:
304 if (page)
305 put_page(page);
306 BUG_ON(copied < 0);
307 BUG_ON(err > 0);
308 BUG_ON(!copied && !err);
309 return copied ? copied : err;
310}
311#else /* !CONFIG_HUGETLB_PAGE */
312/* fail at build time if gcc attempts to use this */
313extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
314 struct vm_area_struct *dst_vma,
315 unsigned long dst_start,
316 unsigned long src_start,
317 unsigned long len,
318 bool zeropage);
319#endif /* CONFIG_HUGETLB_PAGE */
320
c1a4de99
AA
321static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
322 unsigned long dst_start,
323 unsigned long src_start,
324 unsigned long len,
325 bool zeropage)
326{
327 struct vm_area_struct *dst_vma;
328 ssize_t err;
329 pmd_t *dst_pmd;
330 unsigned long src_addr, dst_addr;
b6ebaedb
AA
331 long copied;
332 struct page *page;
c1a4de99
AA
333
334 /*
335 * Sanitize the command parameters:
336 */
337 BUG_ON(dst_start & ~PAGE_MASK);
338 BUG_ON(len & ~PAGE_MASK);
339
340 /* Does the address range wrap, or is the span zero-sized? */
341 BUG_ON(src_start + len <= src_start);
342 BUG_ON(dst_start + len <= dst_start);
343
b6ebaedb
AA
344 src_addr = src_start;
345 dst_addr = dst_start;
346 copied = 0;
347 page = NULL;
348retry:
c1a4de99
AA
349 down_read(&dst_mm->mmap_sem);
350
351 /*
352 * Make sure the vma is not shared, that the dst range is
353 * both valid and fully within a single existing vma.
354 */
355 err = -EINVAL;
356 dst_vma = find_vma(dst_mm, dst_start);
357 if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
b6ebaedb 358 goto out_unlock;
c1a4de99
AA
359 if (dst_start < dst_vma->vm_start ||
360 dst_start + len > dst_vma->vm_end)
b6ebaedb 361 goto out_unlock;
c1a4de99 362
60d4d2d2
MK
363 /*
364 * If this is a HUGETLB vma, pass off to appropriate routine
365 */
366 if (is_vm_hugetlb_page(dst_vma))
367 return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
368 src_start, len, zeropage);
369
c1a4de99
AA
370 /*
371 * Be strict and only allow __mcopy_atomic on userfaultfd
372 * registered ranges to prevent userland errors going
373 * unnoticed. As far as the VM consistency is concerned, it
374 * would be perfectly safe to remove this check, but there's
375 * no useful usage for __mcopy_atomic ouside of userfaultfd
376 * registered ranges. This is after all why these are ioctls
377 * belonging to the userfaultfd and not syscalls.
378 */
379 if (!dst_vma->vm_userfaultfd_ctx.ctx)
b6ebaedb 380 goto out_unlock;
c1a4de99
AA
381
382 /*
383 * FIXME: only allow copying on anonymous vmas, tmpfs should
384 * be added.
385 */
a94720bf 386 if (!vma_is_anonymous(dst_vma))
b6ebaedb 387 goto out_unlock;
c1a4de99
AA
388
389 /*
390 * Ensure the dst_vma has a anon_vma or this page
391 * would get a NULL anon_vma when moved in the
392 * dst_vma.
393 */
394 err = -ENOMEM;
395 if (unlikely(anon_vma_prepare(dst_vma)))
b6ebaedb 396 goto out_unlock;
c1a4de99 397
b6ebaedb 398 while (src_addr < src_start + len) {
c1a4de99 399 pmd_t dst_pmdval;
b6ebaedb 400
c1a4de99 401 BUG_ON(dst_addr >= dst_start + len);
b6ebaedb 402
c1a4de99
AA
403 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
404 if (unlikely(!dst_pmd)) {
405 err = -ENOMEM;
406 break;
407 }
408
409 dst_pmdval = pmd_read_atomic(dst_pmd);
410 /*
411 * If the dst_pmd is mapped as THP don't
412 * override it and just be strict.
413 */
414 if (unlikely(pmd_trans_huge(dst_pmdval))) {
415 err = -EEXIST;
416 break;
417 }
418 if (unlikely(pmd_none(dst_pmdval)) &&
3ed3a4f0 419 unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
c1a4de99
AA
420 err = -ENOMEM;
421 break;
422 }
423 /* If an huge pmd materialized from under us fail */
424 if (unlikely(pmd_trans_huge(*dst_pmd))) {
425 err = -EFAULT;
426 break;
427 }
428
429 BUG_ON(pmd_none(*dst_pmd));
430 BUG_ON(pmd_trans_huge(*dst_pmd));
431
432 if (!zeropage)
433 err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
b6ebaedb 434 dst_addr, src_addr, &page);
c1a4de99
AA
435 else
436 err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
437 dst_addr);
438
439 cond_resched();
440
b6ebaedb
AA
441 if (unlikely(err == -EFAULT)) {
442 void *page_kaddr;
443
444 up_read(&dst_mm->mmap_sem);
445 BUG_ON(!page);
446
447 page_kaddr = kmap(page);
448 err = copy_from_user(page_kaddr,
449 (const void __user *) src_addr,
450 PAGE_SIZE);
451 kunmap(page);
452 if (unlikely(err)) {
453 err = -EFAULT;
454 goto out;
455 }
456 goto retry;
457 } else
458 BUG_ON(page);
459
c1a4de99
AA
460 if (!err) {
461 dst_addr += PAGE_SIZE;
462 src_addr += PAGE_SIZE;
463 copied += PAGE_SIZE;
464
465 if (fatal_signal_pending(current))
466 err = -EINTR;
467 }
468 if (err)
469 break;
470 }
471
b6ebaedb 472out_unlock:
c1a4de99 473 up_read(&dst_mm->mmap_sem);
b6ebaedb
AA
474out:
475 if (page)
09cbfeaf 476 put_page(page);
c1a4de99
AA
477 BUG_ON(copied < 0);
478 BUG_ON(err > 0);
479 BUG_ON(!copied && !err);
480 return copied ? copied : err;
481}
482
483ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
484 unsigned long src_start, unsigned long len)
485{
486 return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
487}
488
489ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
490 unsigned long len)
491{
492 return __mcopy_atomic(dst_mm, start, 0, len, true);
493}