// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

struct hmm_vma_walk {
	struct hmm_range	*range;
	unsigned long		last;
	unsigned int		flags;
};

enum {
	HMM_NEED_FAULT = 1 << 0,
	HMM_NEED_WRITE_FAULT = 1 << 1,
	HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};

static int hmm_pfns_fill(unsigned long addr, unsigned long end,
		struct hmm_range *range, enum hmm_pfn_value_e value)
{
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[value];

	return 0;
}

/*
 * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @required_fault: HMM_NEED_* flags
 * @walk: mm_walk structure
 * Return: -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_fault(unsigned long addr, unsigned long end,
			 unsigned int required_fault, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t *pfns = range->pfns;
	unsigned long i = (addr - range->start) >> PAGE_SHIFT;
	unsigned int fault_flags = FAULT_FLAG_REMOTE;

	WARN_ON_ONCE(!required_fault);
	hmm_vma_walk->last = addr;

	if (!vma)
		goto out_error;

	if (required_fault & HMM_NEED_WRITE_FAULT) {
		if (!(vma->vm_flags & VM_WRITE))
			return -EPERM;
		fault_flags |= FAULT_FLAG_WRITE;
	}

	for (; addr < end; addr += PAGE_SIZE, i++)
		if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
			goto out_error;

	return -EBUSY;

out_error:
	pfns[i] = range->values[HMM_PFN_ERROR];
	return -EFAULT;
}

static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				       uint64_t pfns, uint64_t cpu_flags)
{
	struct hmm_range *range = hmm_vma_walk->range;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
		return 0;

	/*
	 * Consider not only the individual per-page request but also the
	 * default flags requested for the whole range. The API can be used
	 * two ways: the HMM user either coalesces multiple page faults into
	 * one request and sets flags per pfn for those faults, or wants to
	 * pre-fault a range with specific flags. For the latter it would be
	 * a waste to have the user pre-fill the pfn array with a default
	 * flags value. (See the illustrative sketch after this function.)
	 */
	pfns = (pfns & range->pfn_flags_mask) | range->default_flags;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return 0;

	/* Need to write fault? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE]))
		return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;

	/* If CPU page table is not valid then we need to fault */
	if (!(cpu_flags & range->flags[HMM_PFN_VALID]))
		return HMM_NEED_FAULT;
	return 0;
}
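
/*
 * Illustrative sketch (not kernel logic): how a caller might use
 * default_flags and pfn_flags_mask to pre-fault a whole range without
 * pre-filling the pfns array. The flag bit positions below are defined by
 * the driver and are purely hypothetical here.
 *
 *	range.flags[HMM_PFN_VALID] = 1UL << 63;
 *	range.flags[HMM_PFN_WRITE] = 1UL << 62;
 *
 *	// Request at least read access for every page in the range,
 *	// ignoring whatever is currently in range.pfns[]:
 *	range.default_flags = 1UL << 63;
 *	range.pfn_flags_mask = 0;
 *
 *	// Same, but additionally request write access for one page:
 *	range.pfn_flags_mask = 1UL << 62;
 *	range.pfns[index_of_write] = 1UL << 62;
 */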

static unsigned int
hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
		     const uint64_t *pfns, unsigned long npages,
		     uint64_t cpu_flags)
{
	unsigned int required_fault = 0;
	unsigned long i;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
		return 0;

	for (i = 0; i < npages; ++i) {
		required_fault |=
			hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags);
		if (required_fault == HMM_NEED_ALL_BITS)
			return required_fault;
	}
	return required_fault;
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     __always_unused int depth, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0);
	if (required_fault)
		return hmm_vma_fault(addr, end, required_fault, walk);
	hmm_vma_walk->last = addr;
	return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, uint64_t *pfns, pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	unsigned int required_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	required_fault =
		hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags);
	if (required_fault)
		return hmm_vma_fault(addr, end, required_fault, walk);

	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
	hmm_vma_walk->last = end;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline bool hmm_is_device_private_entry(struct hmm_range *range,
		swp_entry_t entry)
{
	return is_device_private_entry(entry) &&
		device_private_entry_to_page(entry)->pgmap->owner ==
		range->dev_private_owner;
}
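
/*
 * Illustrative note for callers (not used by the code here): device private
 * entries are only decoded when their pagemap owner matches the owner the
 * caller declared in the range, e.g. in a hypothetical driver "drv":
 *
 *	pagemap.type = MEMORY_DEVICE_PRIVATE;
 *	pagemap.owner = drv;			// any driver-unique cookie
 *	...
 *	range.dev_private_owner = drv;		// must match pgmap->owner
 *
 * Entries belonging to some other device fail this check and are handled by
 * hmm_vma_handle_pte() like any other non-present entry.
 */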

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	if (pte_none(pte)) {
		required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
		if (required_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		/*
		 * Never fault in device private pages, but just report
		 * the PFN even if not present.
		 */
		if (hmm_is_device_private_entry(range, entry)) {
			*pfn = hmm_device_entry_from_pfn(range,
					swp_offset(entry));
			*pfn |= range->flags[HMM_PFN_VALID];
			if (is_write_device_private_entry(entry))
				*pfn |= range->flags[HMM_PFN_WRITE];
			return 0;
		}

		required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
		if (!required_fault)
			return 0;

		if (!non_swap_entry(entry))
			goto fault;

		if (is_migration_entry(entry)) {
			pte_unmap(ptep);
			hmm_vma_walk->last = addr;
			migration_entry_wait(walk->mm, pmdp, addr);
			return -EBUSY;
		}

		/* Report error for everything else */
		pte_unmap(ptep);
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
	if (required_fault)
		goto fault;

	/*
	 * Since each architecture defines a struct page for the zero page, just
	 * fall through and treat it like a normal page.
	 */
	if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
		if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) {
			pte_unmap(ptep);
			return -EFAULT;
		}
		*pfn = range->values[HMM_PFN_SPECIAL];
		return 0;
	}

	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_fault(addr, end, required_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long addr = start;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, -1, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
	}

	if (!pmd_present(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take pmd_lock here, even if some other thread
		 * is splitting the huge pmd we will get that event through
		 * the mmu_notifier callback.
		 *
		 * So just read the pmd value again, check that it is still a
		 * transparent huge or device mapping entry, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point either it is a
	 * valid pmd entry pointing to a pte directory or it is a bad pmd that
	 * will not recover.
	 */
	if (pmd_bad(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
		if (r) {
			/* hmm_vma_handle_pte() did pte_unmap() */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
		struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start;
	pud_t pud;
	int ret = 0;
	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);

	if (!ptl)
		return 0;

	/* Normally we don't want to split the huge page */
	walk->action = ACTION_CONTINUE;

	pud = READ_ONCE(*pudp);
	if (pud_none(pud)) {
		spin_unlock(ptl);
		return hmm_vma_walk_hole(start, end, -1, walk);
	}

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		unsigned int required_fault;
		uint64_t *pfns, cpu_flags;

		if (!pud_present(pud)) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole(start, end, -1, walk);
		}

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		required_fault = hmm_range_need_fault(hmm_vma_walk, pfns,
						      npages, cpu_flags);
		if (required_fault) {
			spin_unlock(ptl);
			return hmm_vma_fault(addr, end, required_fault, walk);
		}

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn)
			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				  cpu_flags;
		hmm_vma_walk->last = end;
		goto out_unlock;
	}

	/* Ask for the PUD to be split */
	walk->action = ACTION_SUBTREE;

out_unlock:
	spin_unlock(ptl);
	return ret;
}
#else
#define hmm_vma_walk_pud	NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	unsigned long addr = start, i, pfn;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t orig_pfn, cpu_flags;
	unsigned int required_fault;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> PAGE_SHIFT;
	orig_pfn = range->pfns[i];
	range->pfns[i] = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
	required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
	if (required_fault) {
		spin_unlock(ptl);
		return hmm_vma_fault(addr, end, required_fault, walk);
	}

	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				 cpu_flags;
	hmm_vma_walk->last = end;
	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static int hmm_vma_walk_test(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
	    vma->vm_flags & VM_READ)
		return 0;

	/*
	 * vma ranges that don't have struct page backing them or map I/O
	 * devices directly cannot be handled by hmm_range_fault().
	 *
	 * If the vma does not allow read access, then assume that it does not
	 * allow write access either. HMM does not support architectures that
	 * allow write without read.
	 *
	 * If a fault is requested for an unsupported range then it is a hard
	 * failure.
	 */
	if (hmm_range_need_fault(hmm_vma_walk,
				 range->pfns +
					((start - range->start) >> PAGE_SHIFT),
				 (end - start) >> PAGE_SHIFT, 0))
		return -EFAULT;

	hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	hmm_vma_walk->last = end;

	/* Skip this vma and continue processing the next vma. */
	return 1;
}

static const struct mm_walk_ops hmm_walk_ops = {
	.pud_entry	= hmm_vma_walk_pud,
	.pmd_entry	= hmm_vma_walk_pmd,
	.pte_hole	= hmm_vma_walk_hole,
	.hugetlb_entry	= hmm_vma_walk_hugetlb_entry,
	.test_walk	= hmm_vma_walk_test,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range: range being faulted
 * @flags: HMM_FAULT_* flags
 *
 * Return: the number of valid pages in range->pfns[] (from range start
 * address), which may be zero. On error one of the following status codes
 * can be returned:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid vma
 *		(e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is read
 *		only).
 * -EBUSY:	The range has been invalidated and the caller needs to wait for
 *		the invalidation to finish.
 * -EFAULT:	Invalid range (i.e., either there is no valid vma or it is
 *		illegal to access that range).
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs
 * and the caller does not ask for migration.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 */
long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
	struct hmm_vma_walk hmm_vma_walk = {
		.range = range,
		.last = range->start,
		.flags = flags,
	};
	struct mm_struct *mm = range->notifier->mm;
	int ret;

	lockdep_assert_held(&mm->mmap_sem);

	do {
		/* If range is no longer valid force retry. */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq))
			return -EBUSY;
		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
				      &hmm_walk_ops, &hmm_vma_walk);
	} while (ret == -EBUSY);

	if (ret)
		return ret;
	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
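
/*
 * Illustrative caller sketch (not part of this file): hmm_range_fault() is
 * expected to be paired with an mmu_interval_notifier so the result can be
 * discarded if the range was invalidated before the device page table was
 * updated. The "drv" pointer, its pagetable_lock, and the drv_hmm_* tables
 * are hypothetical placeholders; mm is range.notifier->mm.
 *
 *	range.notifier = &mni;		// from mmu_interval_notifier_insert()
 *	range.start = start;
 *	range.end = end;
 *	range.pfns = pfns;
 *	range.flags = drv_hmm_flags;
 *	range.values = drv_hmm_values;
 *
 * again:
 *	range.notifier_seq = mmu_interval_read_begin(range.notifier);
 *	down_read(&mm->mmap_sem);
 *	ret = hmm_range_fault(&range, 0);
 *	up_read(&mm->mmap_sem);
 *	if (ret < 0) {
 *		if (ret == -EBUSY)
 *			goto again;
 *		return ret;
 *	}
 *
 *	mutex_lock(&drv->pagetable_lock);
 *	if (mmu_interval_read_retry(range.notifier, range.notifier_seq)) {
 *		mutex_unlock(&drv->pagetable_lock);
 *		goto again;
 *	}
 *	// program the device page table from range.pfns[] here
 *	mutex_unlock(&drv->pagetable_lock);
 */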