// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out_convert_errno;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out_convert_errno;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error)
			goto out_convert_errno;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error)
			goto out_convert_errno;
	}

success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vma->vm_flags = new_flags;

out_convert_errno:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
out:
	return error;
}

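/*
 * Illustrative sketch (an editor's addition, not upstream code):
 * advising only a sub-range of a mapping splits the enclosing VMA so
 * the new flags apply to exactly that range:
 *
 *	char *p = mmap(NULL, 3 * 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(p + 4096, 4096, MADV_DONTFORK);
 *
 * leaves three VMAs, with VM_DONTCOPY set only on the middle one,
 * unless vma_merge() above can fold neighbours with identical flags
 * back together.
 */
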
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
							vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry		= swapin_walk_pmd_entry,
};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		swp_entry_t swap;

		if (!xa_is_value(page))
			continue;
		xas_pause(&xas);
		rcu_read_unlock();

		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
							NULL, 0, false);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif		/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
						file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

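/*
 * Illustrative sketch (an editor's addition, not upstream code): a
 * typical MADV_WILLNEED caller prefetches a file region it is about to
 * touch:
 *
 *	void *buf = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(buf, len, MADV_WILLNEED);	// returns before I/O completes
 *
 * For regular file mappings this reaches vfs_fadvise(...,
 * POSIX_FADV_WILLNEED) above; for anonymous and shmem mappings (with
 * CONFIG_SWAP) it starts swap readahead instead.
 */
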
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	spinlock_t *ptl;
	struct page *page = NULL;
	LIST_HEAD(page_list);

	if (fatal_signal_pending(current))
		return -EINTR;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		page = pmd_page(orig_pmd);

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			err = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (!err)
				goto regular_page;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&page_list);
		return 0;
	}

regular_page:
	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * Creating a THP page is expensive, so split it only if we
		 * are sure it's worth it: split only if we are the sole
		 * owner.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				break;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				break;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				break;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a page to accelerate its reclaim.
		 * The VM can't reclaim the page unless we clear PG_young.
		 * As a side effect, this can confuse idle-page tracking,
		 * which will miss recent reference history.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
	}

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	if (pageout)
		reclaim_pages(&page_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb, start_addr, end_addr);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_do_pageout(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return true;
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
}

static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!can_do_pageout(vma))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb, start_addr, end_addr);

	return 0;
}

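/*
 * Illustrative sketch (an editor's addition, not upstream code):
 * MADV_COLD and MADV_PAGEOUT share madvise_cold_or_pageout_pte_range()
 * above and differ only in madvise_walk_private.pageout. MADV_COLD
 * merely deactivates the pages (making them prime reclaim candidates),
 * while MADV_PAGEOUT isolates them and calls reclaim_pages() right away:
 *
 *	madvise(addr, len, MADV_COLD);		// cheap hint
 *	madvise(addr, len, MADV_PAGEOUT);	// immediate reclaim
 *
 * Either way the contents are preserved and can be faulted back in,
 * unlike MADV_DONTNEED or MADV_FREE below.
 */
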
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)

{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has a swp_entry, just clear the page table so
		 * we don't swap in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * If the pmd isn't transhuge but the page is THP and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we can't clear
			 * its PG_dirty flag.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at and tlb_remove_tlb_entry, so for
			 * portability remap the pte as old|clean after
			 * clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry		= madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, range.start, range.end);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, range.start, range.end);

	return 0;
}

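/*
 * Illustrative sketch (an editor's addition, not upstream code):
 * MADV_FREE marks anonymous pages "lazy free". They stay mapped and
 * readable, but reclaim may discard them instead of swapping them out;
 * writing to a page cancels the hint for that page:
 *
 *	madvise(p, len, MADV_FREE);
 *	...
 *	p[0] = 1;	// re-dirtied: this page will not be discarded
 *
 * If a page is discarded before being rewritten, a later read faults in
 * a zero-filled page, so callers must treat the range as undefined
 * until it is written again.
 */
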
/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = find_vma(mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_lru_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released, the concurrent operation does not
			 * make the result of madvise() undefined.
			 * There may be an adjacent next vma that we'll
			 * walk next. userfaultfd_remove() will generate
			 * an UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

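/*
 * Illustrative sketch (an editor's addition, not upstream code): after
 * MADV_DONTNEED on a private anonymous range the old contents are gone
 * immediately, and the next touch faults in fresh zero-filled pages:
 *
 *	p[0] = 42;
 *	madvise(p, len, MADV_DONTNEED);
 *	assert(p[0] == 0);	// zero-fill-on-demand, not the old value
 *
 * This contrasts with MADV_FREE above, which keeps the contents until
 * reclaim actually needs the memory.
 */
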
/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_mutex.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

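/*
 * Illustrative sketch (an editor's addition, not upstream code): for a
 * shared writable file mapping, MADV_REMOVE behaves like punching a
 * hole in the backing file at the mapped offset:
 *
 *	madvise(p, len, MADV_REMOVE);
 *	// ~ fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *	//             off, len)
 *
 * which is the vfs_fallocate() call issued above: the pages and their
 * backing blocks are deallocated, and subsequent reads see zeroes.
 */
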
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED);
		}

		if (ret)
			return ret;
	}

	return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

static bool
process_madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
		return true;
	default:
		return false;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the application
 *		will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VMPFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

	start = untagged_addr(start);

	if (!madvise_behavior_valid(behavior))
		return error;

	if (!PAGE_ALIGNED(start))
		return error;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end). */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}
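
/*
 * Illustrative userspace sketch (an editor's addition, not upstream
 * code) for the madvise(2) wrapper defined below:
 *
 *	#include <sys/mman.h>
 *
 *	if (madvise(addr, length, MADV_SEQUENTIAL) != 0)
 *		perror("madvise");
 *
 * addr must be page-aligned; length is rounded up to a multiple of the
 * page size by do_madvise() above.
 */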

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}

SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV], iovec;
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct pid *pid;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	pid = pidfd_get_pid(pidfd, &f_flags);
	if (IS_ERR(pid)) {
		ret = PTR_ERR(pid);
		goto free_iov;
	}

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task) {
		ret = -ESRCH;
		goto put_pid;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		iovec = iov_iter_iovec(&iter);
		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
					iovec.iov_len, behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iovec.iov_len);
	}

	if (ret == 0)
		ret = total_len - iov_iter_count(&iter);

	mmput(mm);
release_task:
	put_task_struct(task);
put_pid:
	put_pid(pid);
free_iov:
	kfree(iov);
out:
	return ret;
}
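
/*
 * Illustrative userspace sketch (an editor's addition, not upstream
 * code): there is no libc wrapper for process_madvise() at this point,
 * so it is reached via syscall(2) using a pidfd for the target process:
 *
 *	struct iovec iov = { .iov_base = remote_addr, .iov_len = len };
 *	ssize_t n = syscall(__NR_process_madvise, pidfd, &iov, 1,
 *			    MADV_PAGEOUT, 0);
 *
 * The caller needs PTRACE_MODE_ATTACH_FSCREDS credentials over the
 * target (see the mm_access() check above). On success the return
 * value is the number of bytes advised; an error part-way through the
 * iovec array returns a negative errno instead.
 */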