// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support six policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case NUMA_NO_NODE here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * preferred many	Try a set of nodes first before normal fallback. This is
 *			similar to preferred without the special case.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
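
/*
 * Illustrative sketch (not part of the kernel build): how a userspace task
 * typically requests these policies through the syscalls implemented below,
 * assuming the libnuma <numaif.h> wrappers; the node numbers and the
 * addr/length variables are examples only.
 *
 *	#include <numaif.h>
 *
 *	unsigned long interleave_mask = (1UL << 0) | (1UL << 1);
 *	// interleave all future allocations of this task over nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &interleave_mask,
 *		      sizeof(interleave_mask) * 8);
 *
 *	unsigned long node0 = 1UL << 0;
 *	// bind an existing mapping to node 0 and migrate its pages there
 *	mbind(addr, length, MPOL_BIND, &node0, sizeof(node0) * 8, MPOL_MF_MOVE);
 */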

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_map_to_online_node - Find closest online node
 * @node: Node id to start the search
 *
 * Lookup the next closest node by distance if @node is not online.
 */
int numa_map_to_online_node(int node)
{
	int min_dist = INT_MAX, dist, n, min_node;

	if (node == NUMA_NO_NODE || node_online(node))
		return node;

	min_node = node;
	for_each_online_node(n) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);

struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
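
/*
 * Worked example (illustrative only, assuming the bitmap_fold()/bitmap_onto()
 * semantics documented in nodemask.h): for an MPOL_F_RELATIVE_NODES policy
 * the user mask holds node numbers relative to the allowed set. With
 * *rel = {4,5,6,7} (four allowed nodes) and *orig = {0,2}, the fold keeps
 * {0,2} and nodes_onto() maps them onto the 0th and 2nd set bits of *rel,
 * so *ret = {4,6}. Relative bits beyond the allowed weight wrap around.
 */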

static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;

	nodes_clear(pol->nodes);
	node_set(first_node(*nodes), pol->nodes);
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy. mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy. May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) and local memory policies are not subject
	 * to any remapping. They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}

/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);

			mode = MPOL_LOCAL;
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	mmap_write_lock(mm);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new);
	mmap_write_unlock(mm);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
};

static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	unsigned long start;
	unsigned long end;
	struct vm_area_struct *first;
};

/*
 * Check if the page's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the invert of qp->nmask.
 */
static inline bool queue_pages_required(struct page *page,
					struct queue_pages *qp)
{
	int nid = page_to_nid(page);
	unsigned long flags = qp->flags;

	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}
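
/*
 * Illustrative example (not from the original source): do_mbind() walks the
 * range with MPOL_MF_INVERT set, so "required" means "misplaced". With
 * qp->nmask = {0,1} and MPOL_MF_INVERT set, a page on node 2 returns true
 * (it must be queued for migration or reported under MPOL_MF_STRICT), while
 * a page already on node 0 or 1 returns false and is skipped.
 */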

/*
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully, or
 *     a special page was encountered, i.e. the huge zero page.
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - the pmd is a migration entry, or only MPOL_MF_STRICT was specified
 *        and an existing page was already on a node that does not follow
 *        the policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
	__releases(ptl)
{
	int ret = 0;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		ret = -EIO;
		goto unlock;
	}
	page = pmd_page(*pmd);
	if (is_huge_zero_page(page)) {
		spin_unlock(ptl);
		walk->action = ACTION_CONTINUE;
		goto out;
	}
	if (!queue_pages_required(page, qp))
		goto unlock;

	flags = qp->flags;
	/* go to thp migration */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma) ||
		    migrate_page_add(page, qp->pagelist, flags)) {
			ret = 1;
			goto unlock;
		}
	} else
		ret = -EIO;
unlock:
	spin_unlock(ptl);
out:
	return ret;
}

/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully, or
 *     a special page was encountered, i.e. the zero page.
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	int ret;
	bool has_unmovable = false;
	pte_t *pte, *mapped_pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
		if (ret != 2)
			return ret;
	}
	/* THP was split, fall through to pte walk */

	if (pmd_trans_unstable(pmd))
		return 0;

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		if (!queue_pages_required(page, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/* MPOL_MF_STRICT must be specified if we get here */
			if (!vma_migratable(vma)) {
				has_unmovable = true;
				break;
			}

			/*
			 * Do not abort immediately since there may be
			 * temporary off LRU pages in the range. Still
			 * need to migrate other LRU pages.
			 */
			if (migrate_page_add(page, qp->pagelist, flags))
				has_unmovable = true;
		} else
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();

	if (has_unmovable)
		return 1;

	return addr != end ? -EIO : 0;
}

static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
	int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = (qp->flags & MPOL_MF_VALID);
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	if (!queue_pages_required(page, qp))
		goto unlock;

	if (flags == MPOL_MF_STRICT) {
		/*
		 * STRICT alone means only detecting misplaced page and no
		 * need to further check other vma.
		 */
		ret = -EIO;
		goto unlock;
	}

	if (!vma_migratable(walk->vma)) {
		/*
		 * Must be STRICT with MOVE*, otherwise .test_walk() would
		 * have stopped walking the current vma.
		 * Detecting misplaced page but allow migrating pages which
		 * have been queued.
		 */
		ret = 1;
		goto unlock;
	}

	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
		if (!isolate_huge_page(page, qp->pagelist) &&
		    (flags & MPOL_MF_STRICT))
			/*
			 * Failed to isolate page but allow migrating pages
			 * which have been queued.
			 */
			ret = 1;
	}
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return ret;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated;

	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long endvma = vma->vm_end;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	if (endvma > end)
		endvma = end;

	if (flags & MPOL_MF_LAZY) {
		/* Similar to task_numa_work, skip inaccessible VMAs */
		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
			!(vma->vm_flags & VM_MIXEDMAP))
			change_prot_numa(vma, start, endvma);
		return 1;
	}

	/* queue pages from current vma */
	if (flags & MPOL_MF_VALID)
		return 0;
	return 1;
}

static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_pages_hugetlb,
	.pmd_entry		= queue_pages_pte_range,
	.test_walk		= queue_pages_test_walk,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags), they are isolated and queued to the pagelist which is
 * passed via @private.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is an unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 *         memory range specified by nodemask and maxnode points outside
 *         your accessible address space (-EFAULT)
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
	};

	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);

	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
				struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_lock */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	VM_BUG_ON(!vma);

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
		vmstart = max(start, vma->vm_start);
		vmend = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol, vma->vm_userfaultfd_ctx);
		if (prev) {
			vma = prev;
			goto replace;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
 replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		mpol_put(new);
		goto out;
	}
	task_lock(current);
	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE)
		current->il_prev = MAX_NUMNODES-1;
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

bea904d5
LS
871/*
872 * Return nodemask for policy for get_mempolicy() query
58568d2a
MX
873 *
874 * Called with task's alloc_lock held
bea904d5
LS
875 */
876static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
1da177e4 877{
dfcd3c0d 878 nodes_clear(*nodes);
bea904d5
LS
879 if (p == &default_policy)
880 return;
881
45c4745a 882 switch (p->mode) {
19770b32 883 case MPOL_BIND:
1da177e4 884 case MPOL_INTERLEAVE:
269fbe72 885 case MPOL_PREFERRED:
b27abacc 886 case MPOL_PREFERRED_MANY:
269fbe72 887 *nodes = p->nodes;
1da177e4 888 break;
7858d7bc
FT
889 case MPOL_LOCAL:
890 /* return empty node mask for local allocation */
891 break;
1da177e4
LT
892 default:
893 BUG();
894 }
895}
896
3b9aadf7 897static int lookup_node(struct mm_struct *mm, unsigned long addr)
1da177e4 898{
ba841078 899 struct page *p = NULL;
1da177e4
LT
900 int err;
901
3b9aadf7
AA
902 int locked = 1;
903 err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
2d3a36a4 904 if (err > 0) {
1da177e4
LT
905 err = page_to_nid(p);
906 put_page(p);
907 }
3b9aadf7 908 if (locked)
d8ed45c5 909 mmap_read_unlock(mm);
1da177e4
LT
910 return err;
911}
912
1da177e4 913/* Retrieve NUMA policy */
dbcb0f19
AB
914static long do_get_mempolicy(int *policy, nodemask_t *nmask,
915 unsigned long addr, unsigned long flags)
1da177e4 916{
8bccd85f 917 int err;
1da177e4
LT
918 struct mm_struct *mm = current->mm;
919 struct vm_area_struct *vma = NULL;
3b9aadf7 920 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
1da177e4 921
754af6f5
LS
922 if (flags &
923 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
1da177e4 924 return -EINVAL;
754af6f5
LS
925
926 if (flags & MPOL_F_MEMS_ALLOWED) {
927 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
928 return -EINVAL;
929 *policy = 0; /* just so it's initialized */
58568d2a 930 task_lock(current);
754af6f5 931 *nmask = cpuset_current_mems_allowed;
58568d2a 932 task_unlock(current);
754af6f5
LS
933 return 0;
934 }
935
1da177e4 936 if (flags & MPOL_F_ADDR) {
bea904d5
LS
937 /*
938 * Do NOT fall back to task policy if the
939 * vma/shared policy at addr is NULL. We
940 * want to return MPOL_DEFAULT in this case.
941 */
d8ed45c5 942 mmap_read_lock(mm);
33e3575c 943 vma = vma_lookup(mm, addr);
1da177e4 944 if (!vma) {
d8ed45c5 945 mmap_read_unlock(mm);
1da177e4
LT
946 return -EFAULT;
947 }
948 if (vma->vm_ops && vma->vm_ops->get_policy)
949 pol = vma->vm_ops->get_policy(vma, addr);
950 else
951 pol = vma->vm_policy;
952 } else if (addr)
953 return -EINVAL;
954
955 if (!pol)
bea904d5 956 pol = &default_policy; /* indicates default behavior */
1da177e4
LT
957
958 if (flags & MPOL_F_NODE) {
959 if (flags & MPOL_F_ADDR) {
3b9aadf7
AA
960 /*
961 * Take a refcount on the mpol, lookup_node()
baf2f90b 962 * will drop the mmap_lock, so after calling
3b9aadf7
AA
963 * lookup_node() only "pol" remains valid, "vma"
964 * is stale.
965 */
966 pol_refcount = pol;
967 vma = NULL;
968 mpol_get(pol);
969 err = lookup_node(mm, addr);
1da177e4
LT
970 if (err < 0)
971 goto out;
8bccd85f 972 *policy = err;
1da177e4 973 } else if (pol == current->mempolicy &&
45c4745a 974 pol->mode == MPOL_INTERLEAVE) {
269fbe72 975 *policy = next_node_in(current->il_prev, pol->nodes);
1da177e4
LT
976 } else {
977 err = -EINVAL;
978 goto out;
979 }
bea904d5
LS
980 } else {
981 *policy = pol == &default_policy ? MPOL_DEFAULT :
982 pol->mode;
d79df630
DR
983 /*
984 * Internal mempolicy flags must be masked off before exposing
985 * the policy to userspace.
986 */
987 *policy |= (pol->flags & MPOL_MODE_FLAGS);
bea904d5 988 }
1da177e4 989
1da177e4 990 err = 0;
58568d2a 991 if (nmask) {
c6b6ef8b
LS
992 if (mpol_store_user_nodemask(pol)) {
993 *nmask = pol->w.user_nodemask;
994 } else {
995 task_lock(current);
996 get_policy_nodemask(pol, nmask);
997 task_unlock(current);
998 }
58568d2a 999 }
1da177e4
LT
1000
1001 out:
52cd3b07 1002 mpol_cond_put(pol);
1da177e4 1003 if (vma)
d8ed45c5 1004 mmap_read_unlock(mm);
3b9aadf7
AA
1005 if (pol_refcount)
1006 mpol_put(pol_refcount);
1da177e4
LT
1007 return err;
1008}
1009
b20a3503 1010#ifdef CONFIG_MIGRATION
6ce3c4c0 1011/*
c8633798 1012 * page migration, thp tail pages can be passed.
6ce3c4c0 1013 */
a53190a4 1014static int migrate_page_add(struct page *page, struct list_head *pagelist,
fc301289 1015 unsigned long flags)
6ce3c4c0 1016{
c8633798 1017 struct page *head = compound_head(page);
6ce3c4c0 1018 /*
fc301289 1019 * Avoid migrating a page that is shared with others.
6ce3c4c0 1020 */
c8633798
NH
1021 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1022 if (!isolate_lru_page(head)) {
1023 list_add_tail(&head->lru, pagelist);
1024 mod_node_page_state(page_pgdat(head),
9de4f22a 1025 NR_ISOLATED_ANON + page_is_file_lru(head),
6c357848 1026 thp_nr_pages(head));
a53190a4
YS
1027 } else if (flags & MPOL_MF_STRICT) {
1028 /*
1029 * Non-movable page may reach here. And, there may be
1030 * temporary off LRU pages or non-LRU movable pages.
1031 * Treat them as unmovable pages since they can't be
1032 * isolated, so they can't be moved at the moment. It
1033 * should return -EIO for this case too.
1034 */
1035 return -EIO;
62695a84
NP
1036 }
1037 }
a53190a4
YS
1038
1039 return 0;
7e2ab150 1040}
6ce3c4c0 1041
7e2ab150
CL
1042/*
1043 * Migrate pages from one node to a target node.
1044 * Returns error or the number of pages not migrated.
1045 */
dbcb0f19
AB
1046static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1047 int flags)
7e2ab150
CL
1048{
1049 nodemask_t nmask;
1050 LIST_HEAD(pagelist);
1051 int err = 0;
a0976311
JK
1052 struct migration_target_control mtc = {
1053 .nid = dest,
1054 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1055 };
7e2ab150
CL
1056
1057 nodes_clear(nmask);
1058 node_set(source, nmask);
6ce3c4c0 1059
08270807
MK
1060 /*
1061 * This does not "check" the range but isolates all pages that
1062 * need migration. Between passing in the full user address
1063 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1064 */
1065 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
98094945 1066 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
7e2ab150
CL
1067 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1068
cf608ac1 1069 if (!list_empty(&pagelist)) {
a0976311 1070 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
5ac95884 1071 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
cf608ac1 1072 if (err)
e2d8cf40 1073 putback_movable_pages(&pagelist);
cf608ac1 1074 }
95a402c3 1075
7e2ab150 1076 return err;
6ce3c4c0
CL
1077}
1078
39743889 1079/*
7e2ab150
CL
1080 * Move pages between the two nodesets so as to preserve the physical
1081 * layout as much as possible.
39743889
CL
1082 *
1083 * Returns the number of page that could not be moved.
1084 */
0ce72d4f
AM
1085int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1086 const nodemask_t *to, int flags)
39743889 1087{
7e2ab150 1088 int busy = 0;
f555befd 1089 int err = 0;
7e2ab150 1090 nodemask_t tmp;
39743889 1091
361a2a22 1092 lru_cache_disable();
0aedadf9 1093
d8ed45c5 1094 mmap_read_lock(mm);
39743889 1095
da0aa138
KM
1096 /*
1097 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1098 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1099 * bit in 'tmp', and return that <source, dest> pair for migration.
1100 * The pair of nodemasks 'to' and 'from' define the map.
1101 *
1102 * If no pair of bits is found that way, fallback to picking some
1103 * pair of 'source' and 'dest' bits that are not the same. If the
1104 * 'source' and 'dest' bits are the same, this represents a node
1105 * that will be migrating to itself, so no pages need move.
1106 *
1107 * If no bits are left in 'tmp', or if all remaining bits left
1108 * in 'tmp' correspond to the same bit in 'to', return false
1109 * (nothing left to migrate).
1110 *
1111 * This lets us pick a pair of nodes to migrate between, such that
1112 * if possible the dest node is not already occupied by some other
1113 * source node, minimizing the risk of overloading the memory on a
1114 * node that would happen if we migrated incoming memory to a node
1115 * before migrating outgoing memory source that same node.
1116 *
1117 * A single scan of tmp is sufficient. As we go, we remember the
1118 * most recent <s, d> pair that moved (s != d). If we find a pair
1119 * that not only moved, but what's better, moved to an empty slot
1120 * (d is not set in tmp), then we break out then, with that pair.
ae0e47f0 1121 * Otherwise when we finish scanning from_tmp, we at least have the
da0aa138
KM
1122 * most recent <s, d> pair that moved. If we get all the way through
1123 * the scan of tmp without finding any node that moved, much less
1124 * moved to an empty node, then there is nothing left worth migrating.
1125 */
d4984711 1126
0ce72d4f 1127 tmp = *from;
7e2ab150 1128 while (!nodes_empty(tmp)) {
68d68ff6 1129 int s, d;
b76ac7e7 1130 int source = NUMA_NO_NODE;
7e2ab150
CL
1131 int dest = 0;
1132
1133 for_each_node_mask(s, tmp) {
4a5b18cc
LW
1134
1135 /*
1136 * do_migrate_pages() tries to maintain the relative
1137 * node relationship of the pages established between
1138 * threads and memory areas.
1139 *
1140 * However if the number of source nodes is not equal to
1141 * the number of destination nodes we can not preserve
1142 * this node relative relationship. In that case, skip
1143 * copying memory from a node that is in the destination
1144 * mask.
1145 *
1146 * Example: [2,3,4] -> [3,4,5] moves everything.
1147 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1148 */
1149
0ce72d4f
AM
1150 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1151 (node_isset(s, *to)))
4a5b18cc
LW
1152 continue;
1153
0ce72d4f 1154 d = node_remap(s, *from, *to);
7e2ab150
CL
1155 if (s == d)
1156 continue;
1157
1158 source = s; /* Node moved. Memorize */
1159 dest = d;
1160
1161 /* dest not in remaining from nodes? */
1162 if (!node_isset(dest, tmp))
1163 break;
1164 }
b76ac7e7 1165 if (source == NUMA_NO_NODE)
7e2ab150
CL
1166 break;
1167
1168 node_clear(source, tmp);
1169 err = migrate_to_node(mm, source, dest, flags);
1170 if (err > 0)
1171 busy += err;
1172 if (err < 0)
1173 break;
39743889 1174 }
d8ed45c5 1175 mmap_read_unlock(mm);
d479960e 1176
361a2a22 1177 lru_cache_enable();
7e2ab150
CL
1178 if (err < 0)
1179 return err;
1180 return busy;
b20a3503
CL
1181
1182}
1183
3ad33b24
LS
1184/*
1185 * Allocate a new page for page migration based on vma policy.
d05f0cdc 1186 * Start by assuming the page is mapped by the same vma as contains @start.
3ad33b24
LS
1187 * Search forward from there, if not. N.B., this assumes that the
1188 * list of pages handed to migrate_pages()--which is how we get here--
1189 * is in virtual address order.
1190 */
666feb21 1191static struct page *new_page(struct page *page, unsigned long start)
95a402c3 1192{
d05f0cdc 1193 struct vm_area_struct *vma;
3f649ab7 1194 unsigned long address;
95a402c3 1195
d05f0cdc 1196 vma = find_vma(current->mm, start);
3ad33b24
LS
1197 while (vma) {
1198 address = page_address_in_vma(page, vma);
1199 if (address != -EFAULT)
1200 break;
1201 vma = vma->vm_next;
1202 }
11c731e8
WL
1203
1204 if (PageHuge(page)) {
389c8178
MH
1205 return alloc_huge_page_vma(page_hstate(compound_head(page)),
1206 vma, address);
94723aaf 1207 } else if (PageTransHuge(page)) {
c8633798
NH
1208 struct page *thp;
1209
19deb769
DR
1210 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1211 HPAGE_PMD_ORDER);
c8633798
NH
1212 if (!thp)
1213 return NULL;
1214 prep_transhuge_page(thp);
1215 return thp;
11c731e8 1216 }
0bf598d8 1217 /*
11c731e8 1218 * if !vma, alloc_page_vma() will use task or system default policy
0bf598d8 1219 */
0f556856
MH
1220 return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1221 vma, address);
95a402c3 1222}
b20a3503
CL
1223#else
1224
a53190a4 1225static int migrate_page_add(struct page *page, struct list_head *pagelist,
b20a3503
CL
1226 unsigned long flags)
1227{
a53190a4 1228 return -EIO;
39743889
CL
1229}
1230
0ce72d4f
AM
1231int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1232 const nodemask_t *to, int flags)
b20a3503
CL
1233{
1234 return -ENOSYS;
1235}
95a402c3 1236
666feb21 1237static struct page *new_page(struct page *page, unsigned long start)
95a402c3
CL
1238{
1239 return NULL;
1240}
b20a3503
CL
1241#endif
1242
dbcb0f19 1243static long do_mbind(unsigned long start, unsigned long len,
028fec41
DR
1244 unsigned short mode, unsigned short mode_flags,
1245 nodemask_t *nmask, unsigned long flags)
6ce3c4c0 1246{
6ce3c4c0
CL
1247 struct mm_struct *mm = current->mm;
1248 struct mempolicy *new;
1249 unsigned long end;
1250 int err;
d8835445 1251 int ret;
6ce3c4c0
CL
1252 LIST_HEAD(pagelist);
1253
b24f53a0 1254 if (flags & ~(unsigned long)MPOL_MF_VALID)
6ce3c4c0 1255 return -EINVAL;
74c00241 1256 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
6ce3c4c0
CL
1257 return -EPERM;
1258
1259 if (start & ~PAGE_MASK)
1260 return -EINVAL;
1261
1262 if (mode == MPOL_DEFAULT)
1263 flags &= ~MPOL_MF_STRICT;
1264
1265 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1266 end = start + len;
1267
1268 if (end < start)
1269 return -EINVAL;
1270 if (end == start)
1271 return 0;
1272
028fec41 1273 new = mpol_new(mode, mode_flags, nmask);
6ce3c4c0
CL
1274 if (IS_ERR(new))
1275 return PTR_ERR(new);
1276
b24f53a0
LS
1277 if (flags & MPOL_MF_LAZY)
1278 new->flags |= MPOL_F_MOF;
1279
6ce3c4c0
CL
1280 /*
1281 * If we are using the default policy then operation
1282 * on discontinuous address spaces is okay after all
1283 */
1284 if (!new)
1285 flags |= MPOL_MF_DISCONTIG_OK;
1286
028fec41
DR
1287 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1288 start, start + len, mode, mode_flags,
00ef2d2f 1289 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
6ce3c4c0 1290
0aedadf9
CL
1291 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1292
361a2a22 1293 lru_cache_disable();
0aedadf9 1294 }
4bfc4495
KH
1295 {
1296 NODEMASK_SCRATCH(scratch);
1297 if (scratch) {
d8ed45c5 1298 mmap_write_lock(mm);
4bfc4495 1299 err = mpol_set_nodemask(new, nmask, scratch);
4bfc4495 1300 if (err)
d8ed45c5 1301 mmap_write_unlock(mm);
4bfc4495
KH
1302 } else
1303 err = -ENOMEM;
1304 NODEMASK_SCRATCH_FREE(scratch);
1305 }
b05ca738
KM
1306 if (err)
1307 goto mpol_out;
1308
d8835445 1309 ret = queue_pages_range(mm, start, end, nmask,
6ce3c4c0 1310 flags | MPOL_MF_INVERT, &pagelist);
d8835445
YS
1311
1312 if (ret < 0) {
a85dfc30 1313 err = ret;
d8835445
YS
1314 goto up_out;
1315 }
1316
1317 err = mbind_range(mm, start, end, new);
7e2ab150 1318
b24f53a0
LS
1319 if (!err) {
1320 int nr_failed = 0;
1321
cf608ac1 1322 if (!list_empty(&pagelist)) {
b24f53a0 1323 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
d05f0cdc 1324 nr_failed = migrate_pages(&pagelist, new_page, NULL,
5ac95884 1325 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
cf608ac1 1326 if (nr_failed)
74060e4d 1327 putback_movable_pages(&pagelist);
cf608ac1 1328 }
6ce3c4c0 1329
d8835445 1330 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
6ce3c4c0 1331 err = -EIO;
a85dfc30 1332 } else {
d8835445 1333up_out:
a85dfc30
YS
1334 if (!list_empty(&pagelist))
1335 putback_movable_pages(&pagelist);
1336 }
1337
d8ed45c5 1338 mmap_write_unlock(mm);
d8835445 1339mpol_out:
f0be3d32 1340 mpol_put(new);
d479960e 1341 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
361a2a22 1342 lru_cache_enable();
6ce3c4c0
CL
1343 return err;
1344}
1345
8bccd85f
CL
1346/*
1347 * User space interface with variable sized bitmaps for nodelists.
1348 */
e130242d
AB
1349static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1350 unsigned long maxnode)
1351{
1352 unsigned long nlongs = BITS_TO_LONGS(maxnode);
1353 int ret;
1354
1355 if (in_compat_syscall())
1356 ret = compat_get_bitmap(mask,
1357 (const compat_ulong_t __user *)nmask,
1358 maxnode);
1359 else
1360 ret = copy_from_user(mask, nmask,
1361 nlongs * sizeof(unsigned long));
1362
1363 if (ret)
1364 return -EFAULT;
1365
1366 if (maxnode % BITS_PER_LONG)
1367 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1368
1369 return 0;
1370}
8bccd85f
CL
1371
1372/* Copy a node mask from user space. */
39743889 1373static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
8bccd85f
CL
1374 unsigned long maxnode)
1375{
8bccd85f
CL
1376 --maxnode;
1377 nodes_clear(*nodes);
1378 if (maxnode == 0 || !nmask)
1379 return 0;
a9c930ba 1380 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
636f13c1 1381 return -EINVAL;
8bccd85f 1382
56521e7a
YX
1383 /*
1384 * When the user specified more nodes than supported just check
e130242d
AB
1385 * if the non supported part is all zero, one word at a time,
1386 * starting at the end.
56521e7a 1387 */
e130242d
AB
1388 while (maxnode > MAX_NUMNODES) {
1389 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1390 unsigned long t;
8bccd85f 1391
e130242d 1392 if (get_bitmap(&t, &nmask[maxnode / BITS_PER_LONG], bits))
56521e7a 1393 return -EFAULT;
e130242d
AB
1394
1395 if (maxnode - bits >= MAX_NUMNODES) {
1396 maxnode -= bits;
1397 } else {
1398 maxnode = MAX_NUMNODES;
1399 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1400 }
1401 if (t)
56521e7a
YX
1402 return -EINVAL;
1403 }
1404
e130242d 1405 return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
8bccd85f
CL
1406}
1407
1408/* Copy a kernel node mask to user space */
1409static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1410 nodemask_t *nodes)
1411{
1412 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
050c17f2 1413 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
e130242d
AB
1414 bool compat = in_compat_syscall();
1415
1416 if (compat)
1417 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
8bccd85f
CL
1418
1419 if (copy > nbytes) {
1420 if (copy > PAGE_SIZE)
1421 return -EINVAL;
1422 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1423 return -EFAULT;
1424 copy = nbytes;
e130242d 1425 maxnode = nr_node_ids;
8bccd85f 1426 }
e130242d
AB
1427
1428 if (compat)
1429 return compat_put_bitmap((compat_ulong_t __user *)mask,
1430 nodes_addr(*nodes), maxnode);
1431
8bccd85f
CL
1432 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1433}
1434
95837924
FT
1435/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1436static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1437{
1438 *flags = *mode & MPOL_MODE_FLAGS;
1439 *mode &= ~MPOL_MODE_FLAGS;
b27abacc 1440
a38a59fd 1441 if ((unsigned int)(*mode) >= MPOL_MAX)
95837924
FT
1442 return -EINVAL;
1443 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1444 return -EINVAL;
6d2aec9e
ED
1445 if (*flags & MPOL_F_NUMA_BALANCING) {
1446 if (*mode != MPOL_BIND)
1447 return -EINVAL;
1448 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1449 }
95837924
FT
1450 return 0;
1451}
1452
e7dc9ad6
DB
1453static long kernel_mbind(unsigned long start, unsigned long len,
1454 unsigned long mode, const unsigned long __user *nmask,
1455 unsigned long maxnode, unsigned int flags)
8bccd85f 1456{
95837924 1457 unsigned short mode_flags;
8bccd85f 1458 nodemask_t nodes;
95837924 1459 int lmode = mode;
8bccd85f
CL
1460 int err;
1461
057d3389 1462 start = untagged_addr(start);
95837924
FT
1463 err = sanitize_mpol_flags(&lmode, &mode_flags);
1464 if (err)
1465 return err;
1466
8bccd85f
CL
1467 err = get_nodes(&nodes, nmask, maxnode);
1468 if (err)
1469 return err;
95837924
FT
1470
1471 return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
8bccd85f
CL
1472}
1473
e7dc9ad6
DB
1474SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1475 unsigned long, mode, const unsigned long __user *, nmask,
1476 unsigned long, maxnode, unsigned int, flags)
1477{
1478 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1479}
1480
8bccd85f 1481/* Set the process memory policy */
af03c4ac
DB
1482static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1483 unsigned long maxnode)
8bccd85f 1484{
95837924 1485 unsigned short mode_flags;
8bccd85f 1486 nodemask_t nodes;
95837924
FT
1487 int lmode = mode;
1488 int err;
1489
1490 err = sanitize_mpol_flags(&lmode, &mode_flags);
1491 if (err)
1492 return err;
8bccd85f 1493
8bccd85f
CL
1494 err = get_nodes(&nodes, nmask, maxnode);
1495 if (err)
1496 return err;
95837924
FT
1497
1498 return do_set_mempolicy(lmode, mode_flags, &nodes);
8bccd85f
CL
1499}
1500
af03c4ac
DB
1501SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1502 unsigned long, maxnode)
1503{
1504 return kernel_set_mempolicy(mode, nmask, maxnode);
1505}
1506
b6e9b0ba
DB
1507static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1508 const unsigned long __user *old_nodes,
1509 const unsigned long __user *new_nodes)
39743889 1510{
596d7cfa 1511 struct mm_struct *mm = NULL;
39743889 1512 struct task_struct *task;
39743889
CL
1513 nodemask_t task_nodes;
1514 int err;
596d7cfa
KM
1515 nodemask_t *old;
1516 nodemask_t *new;
1517 NODEMASK_SCRATCH(scratch);
1518
1519 if (!scratch)
1520 return -ENOMEM;
39743889 1521
596d7cfa
KM
1522 old = &scratch->mask1;
1523 new = &scratch->mask2;
1524
1525 err = get_nodes(old, old_nodes, maxnode);
39743889 1526 if (err)
596d7cfa 1527 goto out;
39743889 1528
596d7cfa 1529 err = get_nodes(new, new_nodes, maxnode);
39743889 1530 if (err)
596d7cfa 1531 goto out;
39743889
CL
1532
1533 /* Find the mm_struct */
55cfaa3c 1534 rcu_read_lock();
228ebcbe 1535 task = pid ? find_task_by_vpid(pid) : current;
39743889 1536 if (!task) {
55cfaa3c 1537 rcu_read_unlock();
596d7cfa
KM
1538 err = -ESRCH;
1539 goto out;
39743889 1540 }
3268c63e 1541 get_task_struct(task);
39743889 1542
596d7cfa 1543 err = -EINVAL;
39743889
CL
1544
1545 /*
31367466
OE
1546 * Check if this process has the right to modify the specified process.
1547 * Use the regular "ptrace_may_access()" checks.
39743889 1548 */
31367466 1549 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
c69e8d9c 1550 rcu_read_unlock();
39743889 1551 err = -EPERM;
3268c63e 1552 goto out_put;
39743889 1553 }
c69e8d9c 1554 rcu_read_unlock();
39743889
CL
1555
1556 task_nodes = cpuset_mems_allowed(task);
1557 /* Is the user allowed to access the target nodes? */
596d7cfa 1558 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
39743889 1559 err = -EPERM;
3268c63e 1560 goto out_put;
39743889
CL
1561 }
1562
0486a38b
YX
1563 task_nodes = cpuset_mems_allowed(current);
1564 nodes_and(*new, *new, task_nodes);
1565 if (nodes_empty(*new))
1566 goto out_put;
1567
86c3a764
DQ
1568 err = security_task_movememory(task);
1569 if (err)
3268c63e 1570 goto out_put;
86c3a764 1571
3268c63e
CL
1572 mm = get_task_mm(task);
1573 put_task_struct(task);
f2a9ef88
SL
1574
1575 if (!mm) {
3268c63e 1576 err = -EINVAL;
f2a9ef88
SL
1577 goto out;
1578 }
1579
1580 err = do_migrate_pages(mm, old, new,
1581 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
3268c63e
CL
1582
1583 mmput(mm);
1584out:
596d7cfa
KM
1585 NODEMASK_SCRATCH_FREE(scratch);
1586
39743889 1587 return err;
3268c63e
CL
1588
1589out_put:
1590 put_task_struct(task);
1591 goto out;
1592
39743889
CL
1593}
1594
b6e9b0ba
DB
1595SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1596 const unsigned long __user *, old_nodes,
1597 const unsigned long __user *, new_nodes)
1598{
1599 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1600}
1601
39743889 1602
8bccd85f 1603/* Retrieve NUMA policy */
af03c4ac
DB
1604static int kernel_get_mempolicy(int __user *policy,
1605 unsigned long __user *nmask,
1606 unsigned long maxnode,
1607 unsigned long addr,
1608 unsigned long flags)
8bccd85f 1609{
dbcb0f19 1610 int err;
3f649ab7 1611 int pval;
8bccd85f
CL
1612 nodemask_t nodes;
1613
050c17f2 1614 if (nmask != NULL && maxnode < nr_node_ids)
8bccd85f
CL
1615 return -EINVAL;
1616
4605f057
WH
1617 addr = untagged_addr(addr);
1618
8bccd85f
CL
1619 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1620
1621 if (err)
1622 return err;
1623
1624 if (policy && put_user(pval, policy))
1625 return -EFAULT;
1626
1627 if (nmask)
1628 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1629
1630 return err;
1631}
1632
af03c4ac
DB
1633SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1634 unsigned long __user *, nmask, unsigned long, maxnode,
1635 unsigned long, addr, unsigned long, flags)
1636{
1637 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1638}
1639
20ca87f2
LX
1640bool vma_migratable(struct vm_area_struct *vma)
1641{
1642 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1643 return false;
1644
1645 /*
1646 * DAX device mappings require predictable access latency, so avoid
1647 * incurring periodic faults.
1648 */
1649 if (vma_is_dax(vma))
1650 return false;
1651
1652 if (is_vm_hugetlb_page(vma) &&
1653 !hugepage_migration_supported(hstate_vma(vma)))
1654 return false;
1655
1656 /*
1657 * Migration allocates pages in the highest zone. If we cannot
1658 * do so then migration (at least from node to node) is not
1659 * possible.
1660 */
1661 if (vma->vm_file &&
1662 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1663 < policy_zone)
1664 return false;
1665 return true;
1666}
1667
74d2c3a0
ON
1668struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1669 unsigned long addr)
1da177e4 1670{
8d90274b 1671 struct mempolicy *pol = NULL;
1da177e4
LT
1672
1673 if (vma) {
480eccf9 1674 if (vma->vm_ops && vma->vm_ops->get_policy) {
8d90274b 1675 pol = vma->vm_ops->get_policy(vma, addr);
00442ad0 1676 } else if (vma->vm_policy) {
1da177e4 1677 pol = vma->vm_policy;
00442ad0
MG
1678
1679 /*
1680 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1681 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1682 * count on these policies which will be dropped by
1683 * mpol_cond_put() later
1684 */
1685 if (mpol_needs_cond_ref(pol))
1686 mpol_get(pol);
1687 }
1da177e4 1688 }
f15ca78e 1689
74d2c3a0
ON
1690 return pol;
1691}
1692
1693/*
dd6eecb9 1694 * get_vma_policy(@vma, @addr)
74d2c3a0
ON
1695 * @vma: virtual memory area whose policy is sought
1696 * @addr: address in @vma for shared policy lookup
1697 *
1698 * Returns effective policy for a VMA at specified address.
dd6eecb9 1699 * Falls back to current->mempolicy or system default policy, as necessary.
74d2c3a0
ON
1700 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1701 * count--added by the get_policy() vm_op, as appropriate--to protect against
1702 * freeing by another task. It is the caller's responsibility to free the
1703 * extra reference for shared policies.
1704 */
ac79f78d 1705static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
dd6eecb9 1706 unsigned long addr)
74d2c3a0
ON
1707{
1708 struct mempolicy *pol = __get_vma_policy(vma, addr);
1709
8d90274b 1710 if (!pol)
dd6eecb9 1711 pol = get_task_policy(current);
8d90274b 1712
1da177e4
LT
1713 return pol;
1714}
1715
6b6482bb 1716bool vma_policy_mof(struct vm_area_struct *vma)
fc314724 1717{
6b6482bb 1718 struct mempolicy *pol;
fc314724 1719
6b6482bb
ON
1720 if (vma->vm_ops && vma->vm_ops->get_policy) {
1721 bool ret = false;
fc314724 1722
6b6482bb
ON
1723 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1724 if (pol && (pol->flags & MPOL_F_MOF))
1725 ret = true;
1726 mpol_cond_put(pol);
8d90274b 1727
6b6482bb 1728 return ret;
fc314724
MG
1729 }
1730
6b6482bb 1731 pol = vma->vm_policy;
8d90274b 1732 if (!pol)
6b6482bb 1733 pol = get_task_policy(current);
8d90274b 1734
fc314724
MG
1735 return pol->flags & MPOL_F_MOF;
1736}
1737
d3eb1570
LJ
1738static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1739{
1740 enum zone_type dynamic_policy_zone = policy_zone;
1741
1742 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1743
1744 /*
269fbe72 1745 * if policy->nodes has movable memory only,
d3eb1570
LJ
1746 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1747 *
269fbe72 1748 * policy->nodes is intersected with node_states[N_MEMORY],
f0953a1b 1749 * so if the following test fails, it implies
269fbe72 1750 * policy->nodes has movable memory only.
d3eb1570 1751 */
269fbe72 1752 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
d3eb1570
LJ
1753 dynamic_policy_zone = ZONE_MOVABLE;
1754
1755 return zone >= dynamic_policy_zone;
1756}
1757
52cd3b07
LS
1758/*
1759 * Return a nodemask representing a mempolicy for filtering nodes for
1760 * page allocation
1761 */
8ca39e68 1762nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
19770b32 1763{
b27abacc
DH
1764 int mode = policy->mode;
1765
19770b32 1766 /* Lower zones don't get a nodemask applied for MPOL_BIND */
b27abacc
DH
1767 if (unlikely(mode == MPOL_BIND) &&
1768 apply_policy_zone(policy, gfp_zone(gfp)) &&
1769 cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1770 return &policy->nodes;
1771
1772 if (mode == MPOL_PREFERRED_MANY)
269fbe72 1773 return &policy->nodes;
19770b32
MG
1774
1775 return NULL;
1776}
1777
b27abacc
DH
1778/*
1779 * Return the preferred node id for 'prefer' mempolicy, and return
1780 * the given id for all other policies.
1781 *
1782 * policy_node() is always coupled with policy_nodemask(), which
1783 * secures the nodemask limit for 'bind' and 'prefer-many' policy.
1784 */
f8fd5253 1785static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1da177e4 1786{
7858d7bc 1787 if (policy->mode == MPOL_PREFERRED) {
269fbe72 1788 nd = first_node(policy->nodes);
7858d7bc 1789 } else {
19770b32 1790 /*
6d840958
MH
1791 * __GFP_THISNODE shouldn't even be used with the bind policy
1792 * because we might easily break the expectation to stay on the
1793 * requested node and not break the policy.
19770b32 1794 */
6d840958 1795 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1da177e4 1796 }
6d840958 1797
04ec6264 1798 return nd;
1da177e4
LT
1799}
1800
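/*
 * Illustrative sketch (not part of the original file): policy_node() and
 * policy_nodemask() are meant to be used as a pair when handing a policy
 * to the page allocator, e.g. as the fallback path of alloc_pages_vma()
 * below does:
 *
 *	nmask = policy_nodemask(gfp, pol);
 *	nid = policy_node(gfp, pol, numa_node_id());
 *	page = __alloc_pages(gfp, order, nid, nmask);
 */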
1801/* Do dynamic interleaving for a process */
1802static unsigned interleave_nodes(struct mempolicy *policy)
1803{
45816682 1804 unsigned next;
1da177e4
LT
1805 struct task_struct *me = current;
1806
269fbe72 1807 next = next_node_in(me->il_prev, policy->nodes);
f5b087b5 1808 if (next < MAX_NUMNODES)
45816682
VB
1809 me->il_prev = next;
1810 return next;
1da177e4
LT
1811}
1812
dc85da15
CL
1813/*
1814 * Depending on the memory policy, provide a node from which to allocate the
1815 * next slab entry.
1816 */
2a389610 1817unsigned int mempolicy_slab_node(void)
dc85da15 1818{
e7b691b0 1819 struct mempolicy *policy;
2a389610 1820 int node = numa_mem_id();
e7b691b0 1821
38b031dd 1822 if (!in_task())
2a389610 1823 return node;
e7b691b0
AK
1824
1825 policy = current->mempolicy;
7858d7bc 1826 if (!policy)
2a389610 1827 return node;
bea904d5
LS
1828
1829 switch (policy->mode) {
1830 case MPOL_PREFERRED:
269fbe72 1831 return first_node(policy->nodes);
765c4507 1832
dc85da15
CL
1833 case MPOL_INTERLEAVE:
1834 return interleave_nodes(policy);
1835
b27abacc
DH
1836 case MPOL_BIND:
1837 case MPOL_PREFERRED_MANY:
1838 {
c33d6c06
MG
1839 struct zoneref *z;
1840
dc85da15
CL
1841 /*
1842 * Follow bind policy behavior and start allocation at the
1843 * first node.
1844 */
19770b32 1845 struct zonelist *zonelist;
19770b32 1846 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
c9634cf0 1847 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
c33d6c06 1848 z = first_zones_zonelist(zonelist, highest_zoneidx,
269fbe72 1849 &policy->nodes);
c1093b74 1850 return z->zone ? zone_to_nid(z->zone) : node;
dd1a239f 1851 }
7858d7bc
FT
1852 case MPOL_LOCAL:
1853 return node;
dc85da15 1854
dc85da15 1855 default:
bea904d5 1856 BUG();
dc85da15
CL
1857 }
1858}
1859
fee83b3a
AM
1860/*
1861 * Do static interleaving for a VMA with known offset @n. Returns the n'th
269fbe72 1862 * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
fee83b3a
AM
1863 * number of present nodes.
1864 */
98c70baa 1865static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1da177e4 1866{
276aeee1 1867 nodemask_t nodemask = pol->nodes;
1868 unsigned int target, nnodes;
fee83b3a
AM
1869 int i;
1870 int nid;
276aeee1 1871 /*
1872 * The barrier will stabilize the nodemask in a register or on
1873 * the stack so that it will stop changing under the code.
1874 *
1875 * Between first_node() and next_node(), pol->nodes could be changed
1876 * by other threads, so we copy pol->nodes onto the local stack.
1877 */
1878 barrier();
1da177e4 1879
276aeee1 1880 nnodes = nodes_weight(nodemask);
f5b087b5
DR
1881 if (!nnodes)
1882 return numa_node_id();
fee83b3a 1883 target = (unsigned int)n % nnodes;
276aeee1 1884 nid = first_node(nodemask);
fee83b3a 1885 for (i = 0; i < target; i++)
276aeee1 1886 nid = next_node(nid, nodemask);
1da177e4
LT
1887 return nid;
1888}
1889
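/*
 * Worked example (illustrative): with pol->nodes = {0,2,3}, nnodes is 3, so
 * offset_il_node(pol, 0) returns 0, offset_il_node(pol, 1) returns 2,
 * offset_il_node(pol, 2) returns 3 and offset_il_node(pol, 3) wraps back
 * to 0 again (3 % 3 == 0).
 */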
5da7ca86
CL
1890/* Determine a node number for interleave */
1891static inline unsigned interleave_nid(struct mempolicy *pol,
1892 struct vm_area_struct *vma, unsigned long addr, int shift)
1893{
1894 if (vma) {
1895 unsigned long off;
1896
3b98b087
NA
1897 /*
1898 * for small pages, there is no difference between
1899 * shift and PAGE_SHIFT, so the bit-shift is safe.
1900 * for huge pages, since vm_pgoff is in units of small
1901 * pages, we need to shift off the always 0 bits to get
1902 * a useful offset.
1903 */
1904 BUG_ON(shift < PAGE_SHIFT);
1905 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
5da7ca86 1906 off += (addr - vma->vm_start) >> shift;
98c70baa 1907 return offset_il_node(pol, off);
5da7ca86
CL
1908 } else
1909 return interleave_nodes(pol);
1910}
1911
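/*
 * Worked example (illustrative, assuming 4KB base pages and 2MB huge pages,
 * i.e. PAGE_SHIFT == 12 and shift == 21): a VMA with vm_pgoff == 0x400
 * (a 4MB file offset in base pages) faulting at vma->vm_start + 6MB gives
 *
 *	off = (0x400 >> 9) + (6MB >> 21) = 2 + 3 = 5
 *
 * and the page is interleaved onto offset_il_node(pol, 5).
 */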
00ac59ad 1912#ifdef CONFIG_HUGETLBFS
480eccf9 1913/*
04ec6264 1914 * huge_node(@vma, @addr, @gfp_flags, @mpol)
b46e14ac
FF
1915 * @vma: virtual memory area whose policy is sought
1916 * @addr: address in @vma for shared policy lookup and interleave policy
1917 * @gfp_flags: for requested zone
1918 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
b27abacc 1919 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
480eccf9 1920 *
04ec6264 1921 * Returns a nid suitable for a huge page allocation and a pointer
52cd3b07 1922 * to the struct mempolicy for conditional unref after allocation.
b27abacc
DH
1923 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
1924 * to the mempolicy's @nodemask for filtering the zonelist.
c0ff7453 1925 *
d26914d1 1926 * Must be protected by read_mems_allowed_begin()
480eccf9 1927 */
04ec6264
VB
1928int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1929 struct mempolicy **mpol, nodemask_t **nodemask)
5da7ca86 1930{
04ec6264 1931 int nid;
b27abacc 1932 int mode;
5da7ca86 1933
dd6eecb9 1934 *mpol = get_vma_policy(vma, addr);
b27abacc
DH
1935 *nodemask = NULL;
1936 mode = (*mpol)->mode;
5da7ca86 1937
b27abacc 1938 if (unlikely(mode == MPOL_INTERLEAVE)) {
04ec6264
VB
1939 nid = interleave_nid(*mpol, vma, addr,
1940 huge_page_shift(hstate_vma(vma)));
52cd3b07 1941 } else {
04ec6264 1942 nid = policy_node(gfp_flags, *mpol, numa_node_id());
b27abacc 1943 if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
269fbe72 1944 *nodemask = &(*mpol)->nodes;
480eccf9 1945 }
04ec6264 1946 return nid;
5da7ca86 1947}
06808b08
LS
1948
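/*
 * Illustrative sketch (not part of the original file): a hugetlb allocation
 * site would typically use huge_node() roughly as follows, inside a
 * read_mems_allowed_begin() section as required above:
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	int nid;
 *
 *	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
 *	... allocate a huge page preferring nid, filtered by nodemask ...
 *	mpol_cond_put(mpol);
 */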
1949/*
1950 * init_nodemask_of_mempolicy
1951 *
1952 * If the current task's mempolicy is "default" [NULL], return 'false'
1953 * to indicate default policy. Otherwise, extract the policy nodemask
1954 * for 'bind', 'interleave', 'preferred' or 'preferred (many)' policy
1955 * into the argument nodemask, or initialize the argument nodemask to
1956 * contain just the local node for 'local' policy, and return 'true' to indicate presence
1957 * of non-default mempolicy.
1958 *
1959 * We don't bother with reference counting the mempolicy [mpol_get/put]
1960 * because the current task is examining its own mempolicy and a task's
1961 * mempolicy is only ever changed by the task itself.
1962 *
1963 * N.B., it is the caller's responsibility to free a returned nodemask.
1964 */
1965bool init_nodemask_of_mempolicy(nodemask_t *mask)
1966{
1967 struct mempolicy *mempolicy;
06808b08
LS
1968
1969 if (!(mask && current->mempolicy))
1970 return false;
1971
c0ff7453 1972 task_lock(current);
06808b08
LS
1973 mempolicy = current->mempolicy;
1974 switch (mempolicy->mode) {
1975 case MPOL_PREFERRED:
b27abacc 1976 case MPOL_PREFERRED_MANY:
06808b08 1977 case MPOL_BIND:
06808b08 1978 case MPOL_INTERLEAVE:
269fbe72 1979 *mask = mempolicy->nodes;
7858d7bc
FT
1980 break;
1981
1982 case MPOL_LOCAL:
269fbe72 1983 init_nodemask_of_node(mask, numa_node_id());
06808b08
LS
1984 break;
1985
1986 default:
1987 BUG();
1988 }
c0ff7453 1989 task_unlock(current);
06808b08
LS
1990
1991 return true;
1992}
00ac59ad 1993#endif
5da7ca86 1994
6f48d0eb 1995/*
b26e517a 1996 * mempolicy_in_oom_domain
6f48d0eb 1997 *
b26e517a
FT
1998 * If tsk's mempolicy is "bind", check for intersection between mask and
1999 * the policy nodemask. Otherwise, return true for all other policies
2000 * including "interleave", as a tsk with "interleave" policy may have
2001 * memory allocated from all nodes in system.
6f48d0eb
DR
2002 *
2003 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2004 */
b26e517a 2005bool mempolicy_in_oom_domain(struct task_struct *tsk,
6f48d0eb
DR
2006 const nodemask_t *mask)
2007{
2008 struct mempolicy *mempolicy;
2009 bool ret = true;
2010
2011 if (!mask)
2012 return ret;
b26e517a 2013
6f48d0eb
DR
2014 task_lock(tsk);
2015 mempolicy = tsk->mempolicy;
b26e517a 2016 if (mempolicy && mempolicy->mode == MPOL_BIND)
269fbe72 2017 ret = nodes_intersects(mempolicy->nodes, *mask);
6f48d0eb 2018 task_unlock(tsk);
b26e517a 2019
6f48d0eb
DR
2020 return ret;
2021}
2022
1da177e4
LT
2023/* Allocate a page in interleaved policy.
2024 Own path because it needs to do special accounting. */
662f3a0b
AK
2025static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2026 unsigned nid)
1da177e4 2027{
1da177e4
LT
2028 struct page *page;
2029
84172f4b 2030 page = __alloc_pages(gfp, order, nid, NULL);
4518085e
KW
2031 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2032 if (!static_branch_likely(&vm_numa_stat_key))
2033 return page;
de55c8b2
AR
2034 if (page && page_to_nid(page) == nid) {
2035 preempt_disable();
f19298b9 2036 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
de55c8b2
AR
2037 preempt_enable();
2038 }
1da177e4
LT
2039 return page;
2040}
2041
4c54d949
FT
2042static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2043 int nid, struct mempolicy *pol)
2044{
2045 struct page *page;
2046 gfp_t preferred_gfp;
2047
2048 /*
2049 * This is a two pass approach. The first pass will only try the
2050 * preferred nodes but skip the direct reclaim and allow the
2051 * allocation to fail, while the second pass will try all the
2052 * nodes in system.
2053 */
2054 preferred_gfp = gfp | __GFP_NOWARN;
2055 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2056 page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
2057 if (!page)
2058 page = __alloc_pages(gfp, order, numa_node_id(), NULL);
2059
2060 return page;
2061}
2062
1da177e4 2063/**
eb350739
MWO
2064 * alloc_pages_vma - Allocate a page for a VMA.
2065 * @gfp: GFP flags.
2066 * @order: Order of the GFP allocation.
2067 * @vma: Pointer to VMA or NULL if not available.
2068 * @addr: Virtual address of the allocation. Must be inside @vma.
2069 * @node: Which node to prefer for allocation (modulo policy).
2070 * @hugepage: For hugepages try only the preferred node if possible.
1da177e4 2071 *
eb350739
MWO
2072 * Allocate a page for a specific address in @vma, using the appropriate
2073 * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
2074 * of the mm_struct of the VMA to prevent it from going away. Should be
2075 * used for all allocations for pages that will be mapped into user space.
1da177e4 2076 *
eb350739 2077 * Return: The page on success or NULL if allocation fails.
1da177e4 2078 */
eb350739 2079struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
19deb769 2080 unsigned long addr, int node, bool hugepage)
1da177e4 2081{
cc9a6c87 2082 struct mempolicy *pol;
c0ff7453 2083 struct page *page;
04ec6264 2084 int preferred_nid;
be97a41b 2085 nodemask_t *nmask;
cc9a6c87 2086
dd6eecb9 2087 pol = get_vma_policy(vma, addr);
1da177e4 2088
0867a57c
VB
2089 if (pol->mode == MPOL_INTERLEAVE) {
2090 unsigned nid;
2091
2092 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2093 mpol_cond_put(pol);
2094 page = alloc_page_interleave(gfp, order, nid);
2095 goto out;
19deb769
DR
2096 }
2097
4c54d949
FT
2098 if (pol->mode == MPOL_PREFERRED_MANY) {
2099 page = alloc_pages_preferred_many(gfp, order, node, pol);
2100 mpol_cond_put(pol);
2101 goto out;
2102 }
2103
19deb769
DR
2104 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2105 int hpage_node = node;
2106
2107 /*
2108 * For hugepage allocation and non-interleave policy which
2109 * allows the current node (or other explicitly preferred
2110 * node) we only try to allocate from the current/preferred
2111 * node and don't fall back to other nodes, as the cost of
2112 * remote accesses would likely offset THP benefits.
2113 *
b27abacc 2114 * If the policy is interleave or does not allow the current
19deb769
DR
2115 * node in its nodemask, we allocate the standard way.
2116 */
7858d7bc 2117 if (pol->mode == MPOL_PREFERRED)
269fbe72 2118 hpage_node = first_node(pol->nodes);
19deb769
DR
2119
2120 nmask = policy_nodemask(gfp, pol);
2121 if (!nmask || node_isset(hpage_node, *nmask)) {
2122 mpol_cond_put(pol);
cc638f32
VB
2123 /*
2124 * First, try to allocate THP only on local node, but
2125 * don't reclaim unnecessarily, just compact.
2126 */
19deb769 2127 page = __alloc_pages_node(hpage_node,
cc638f32 2128 gfp | __GFP_THISNODE | __GFP_NORETRY, order);
76e654cc
DR
2129
2130 /*
2131 * If hugepage allocations are configured to always
2132 * synchronous compact or the vma has been madvised
2133 * to prefer hugepage backing, retry allowing remote
cc638f32 2134 * memory with both reclaim and compact as well.
76e654cc
DR
2135 */
2136 if (!page && (gfp & __GFP_DIRECT_RECLAIM))
54962c2e 2137 page = __alloc_pages(gfp, order, hpage_node, nmask);
76e654cc 2138
19deb769
DR
2139 goto out;
2140 }
356ff8a9
DR
2141 }
2142
be97a41b 2143 nmask = policy_nodemask(gfp, pol);
04ec6264 2144 preferred_nid = policy_node(gfp, pol, node);
84172f4b 2145 page = __alloc_pages(gfp, order, preferred_nid, nmask);
d51e9894 2146 mpol_cond_put(pol);
be97a41b 2147out:
c0ff7453 2148 return page;
1da177e4 2149}
69262215 2150EXPORT_SYMBOL(alloc_pages_vma);
1da177e4
LT
2151
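/*
 * Illustrative sketch (not part of the original file): an order-0 page for
 * a faulting user address is typically obtained with something like
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
 *			       numa_node_id(), false);
 *
 * with the mmap_lock held as documented above; alloc_page_vma() is the
 * usual order-0 convenience wrapper around this call.
 */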
2152/**
6421ec76
MWO
2153 * alloc_pages - Allocate pages.
2154 * @gfp: GFP flags.
2155 * @order: Power of two of number of pages to allocate.
1da177e4 2156 *
6421ec76
MWO
2157 * Allocate 1 << @order contiguous pages. The physical address of the
2158 * first page is naturally aligned (eg an order-3 allocation will be aligned
2159 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2160 * process is honoured when in process context.
1da177e4 2161 *
6421ec76
MWO
2162 * Context: Can be called from any context, providing the appropriate GFP
2163 * flags are used.
2164 * Return: The page on success or NULL if allocation fails.
1da177e4 2165 */
d7f946d0 2166struct page *alloc_pages(gfp_t gfp, unsigned order)
1da177e4 2167{
8d90274b 2168 struct mempolicy *pol = &default_policy;
c0ff7453 2169 struct page *page;
1da177e4 2170
8d90274b
ON
2171 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2172 pol = get_task_policy(current);
52cd3b07
LS
2173
2174 /*
2175 * No reference counting needed for current->mempolicy
2176 * nor system default_policy
2177 */
45c4745a 2178 if (pol->mode == MPOL_INTERLEAVE)
c0ff7453 2179 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
4c54d949
FT
2180 else if (pol->mode == MPOL_PREFERRED_MANY)
2181 page = alloc_pages_preferred_many(gfp, order,
2182 numa_node_id(), pol);
c0ff7453 2183 else
84172f4b 2184 page = __alloc_pages(gfp, order,
04ec6264 2185 policy_node(gfp, pol, numa_node_id()),
5c4b4be3 2186 policy_nodemask(gfp, pol));
cc9a6c87 2187
c0ff7453 2188 return page;
1da177e4 2189}
d7f946d0 2190EXPORT_SYMBOL(alloc_pages);
1da177e4 2191
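/*
 * Illustrative sketch (not part of the original file): a typical
 * process-context user that honours the current task's mempolicy,
 * allocating four contiguous pages (order 2):
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *	if (page) {
 *		void *addr = page_address(page);
 *		... use the pages ...
 *		__free_pages(page, 2);
 *	}
 */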
ef0855d3
ON
2192int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2193{
2194 struct mempolicy *pol = mpol_dup(vma_policy(src));
2195
2196 if (IS_ERR(pol))
2197 return PTR_ERR(pol);
2198 dst->vm_policy = pol;
2199 return 0;
2200}
2201
4225399a 2202/*
846a16bf 2203 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
4225399a
PJ
2204 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2205 * with the mems_allowed returned by cpuset_mems_allowed(). This
2206 * keeps mempolicies cpuset relative after its cpuset moves. See
2207 * further kernel/cpuset.c update_nodemask().
708c1bbc
MX
2208 *
2209 * current's mempolicy may be rebound by another task (the task that changes
2210 * cpuset's mems), so we needn't do rebind work for the current task.
4225399a 2211 */
4225399a 2212
846a16bf
LS
2213/* Slow path of a mempolicy duplicate */
2214struct mempolicy *__mpol_dup(struct mempolicy *old)
1da177e4
LT
2215{
2216 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2217
2218 if (!new)
2219 return ERR_PTR(-ENOMEM);
708c1bbc
MX
2220
2221 /* task's mempolicy is protected by alloc_lock */
2222 if (old == current->mempolicy) {
2223 task_lock(current);
2224 *new = *old;
2225 task_unlock(current);
2226 } else
2227 *new = *old;
2228
4225399a
PJ
2229 if (current_cpuset_is_being_rebound()) {
2230 nodemask_t mems = cpuset_mems_allowed(current);
213980c0 2231 mpol_rebind_policy(new, &mems);
4225399a 2232 }
1da177e4 2233 atomic_set(&new->refcnt, 1);
1da177e4
LT
2234 return new;
2235}
2236
2237/* Slow path of a mempolicy comparison */
fcfb4dcc 2238bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1da177e4
LT
2239{
2240 if (!a || !b)
fcfb4dcc 2241 return false;
45c4745a 2242 if (a->mode != b->mode)
fcfb4dcc 2243 return false;
19800502 2244 if (a->flags != b->flags)
fcfb4dcc 2245 return false;
19800502
BL
2246 if (mpol_store_user_nodemask(a))
2247 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
fcfb4dcc 2248 return false;
19800502 2249
45c4745a 2250 switch (a->mode) {
19770b32 2251 case MPOL_BIND:
1da177e4 2252 case MPOL_INTERLEAVE:
1da177e4 2253 case MPOL_PREFERRED:
b27abacc 2254 case MPOL_PREFERRED_MANY:
269fbe72 2255 return !!nodes_equal(a->nodes, b->nodes);
7858d7bc
FT
2256 case MPOL_LOCAL:
2257 return true;
1da177e4
LT
2258 default:
2259 BUG();
fcfb4dcc 2260 return false;
1da177e4
LT
2261 }
2262}
2263
1da177e4
LT
2264/*
2265 * Shared memory backing store policy support.
2266 *
2267 * Remember policies even when nobody has shared memory mapped.
2268 * The policies are kept in a red-black tree linked from the inode.
4a8c7bb5 2269 * They are protected by the sp->lock rwlock, which should be held
1da177e4
LT
2270 * for any accesses to the tree.
2271 */
2272
4a8c7bb5
NZ
2273/*
2274 * lookup first element intersecting start-end. Caller holds sp->lock for
2275 * reading or for writing
2276 */
1da177e4
LT
2277static struct sp_node *
2278sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2279{
2280 struct rb_node *n = sp->root.rb_node;
2281
2282 while (n) {
2283 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2284
2285 if (start >= p->end)
2286 n = n->rb_right;
2287 else if (end <= p->start)
2288 n = n->rb_left;
2289 else
2290 break;
2291 }
2292 if (!n)
2293 return NULL;
2294 for (;;) {
2295 struct sp_node *w = NULL;
2296 struct rb_node *prev = rb_prev(n);
2297 if (!prev)
2298 break;
2299 w = rb_entry(prev, struct sp_node, nd);
2300 if (w->end <= start)
2301 break;
2302 n = prev;
2303 }
2304 return rb_entry(n, struct sp_node, nd);
2305}
2306
4a8c7bb5
NZ
2307/*
2308 * Insert a new shared policy into the list. Caller holds sp->lock for
2309 * writing.
2310 */
1da177e4
LT
2311static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2312{
2313 struct rb_node **p = &sp->root.rb_node;
2314 struct rb_node *parent = NULL;
2315 struct sp_node *nd;
2316
2317 while (*p) {
2318 parent = *p;
2319 nd = rb_entry(parent, struct sp_node, nd);
2320 if (new->start < nd->start)
2321 p = &(*p)->rb_left;
2322 else if (new->end > nd->end)
2323 p = &(*p)->rb_right;
2324 else
2325 BUG();
2326 }
2327 rb_link_node(&new->nd, parent, p);
2328 rb_insert_color(&new->nd, &sp->root);
140d5a49 2329 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
45c4745a 2330 new->policy ? new->policy->mode : 0);
1da177e4
LT
2331}
2332
2333/* Find shared policy intersecting idx */
2334struct mempolicy *
2335mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2336{
2337 struct mempolicy *pol = NULL;
2338 struct sp_node *sn;
2339
2340 if (!sp->root.rb_node)
2341 return NULL;
4a8c7bb5 2342 read_lock(&sp->lock);
1da177e4
LT
2343 sn = sp_lookup(sp, idx, idx+1);
2344 if (sn) {
2345 mpol_get(sn->policy);
2346 pol = sn->policy;
2347 }
4a8c7bb5 2348 read_unlock(&sp->lock);
1da177e4
LT
2349 return pol;
2350}
2351
63f74ca2
KM
2352static void sp_free(struct sp_node *n)
2353{
2354 mpol_put(n->policy);
2355 kmem_cache_free(sn_cache, n);
2356}
2357
771fb4d8
LS
2358/**
2359 * mpol_misplaced - check whether current page node is valid in policy
2360 *
b46e14ac
FF
2361 * @page: page to be checked
2362 * @vma: vm area where page mapped
2363 * @addr: virtual address where page mapped
771fb4d8
LS
2364 *
2365 * Lookup current policy node id for vma,addr and "compare to" page's
5f076944 2366 * node id. Policy determination "mimics" alloc_page_vma().
771fb4d8 2367 * Called from fault path where we know the vma and faulting address.
5f076944 2368 *
062db293
BW
2369 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2370 * policy, or a suitable node ID to allocate a replacement page from.
771fb4d8
LS
2371 */
2372int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2373{
2374 struct mempolicy *pol;
c33d6c06 2375 struct zoneref *z;
771fb4d8
LS
2376 int curnid = page_to_nid(page);
2377 unsigned long pgoff;
90572890
PZ
2378 int thiscpu = raw_smp_processor_id();
2379 int thisnid = cpu_to_node(thiscpu);
98fa15f3 2380 int polnid = NUMA_NO_NODE;
062db293 2381 int ret = NUMA_NO_NODE;
771fb4d8 2382
dd6eecb9 2383 pol = get_vma_policy(vma, addr);
771fb4d8
LS
2384 if (!(pol->flags & MPOL_F_MOF))
2385 goto out;
2386
2387 switch (pol->mode) {
2388 case MPOL_INTERLEAVE:
771fb4d8
LS
2389 pgoff = vma->vm_pgoff;
2390 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
98c70baa 2391 polnid = offset_il_node(pol, pgoff);
771fb4d8
LS
2392 break;
2393
2394 case MPOL_PREFERRED:
b27abacc
DH
2395 if (node_isset(curnid, pol->nodes))
2396 goto out;
269fbe72 2397 polnid = first_node(pol->nodes);
7858d7bc
FT
2398 break;
2399
2400 case MPOL_LOCAL:
2401 polnid = numa_node_id();
771fb4d8
LS
2402 break;
2403
2404 case MPOL_BIND:
bda420b9
HY
2405 /* Optimize placement among multiple nodes via NUMA balancing */
2406 if (pol->flags & MPOL_F_MORON) {
269fbe72 2407 if (node_isset(thisnid, pol->nodes))
bda420b9
HY
2408 break;
2409 goto out;
2410 }
b27abacc 2411 fallthrough;
c33d6c06 2412
b27abacc 2413 case MPOL_PREFERRED_MANY:
771fb4d8 2414 /*
771fb4d8
LS
2415 * use current page if in policy nodemask,
2416 * else select nearest allowed node, if any.
2417 * If no allowed nodes, use current [!misplaced].
2418 */
269fbe72 2419 if (node_isset(curnid, pol->nodes))
771fb4d8 2420 goto out;
c33d6c06 2421 z = first_zones_zonelist(
771fb4d8
LS
2422 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2423 gfp_zone(GFP_HIGHUSER),
269fbe72 2424 &pol->nodes);
c1093b74 2425 polnid = zone_to_nid(z->zone);
771fb4d8
LS
2426 break;
2427
2428 default:
2429 BUG();
2430 }
5606e387
MG
2431
2432 /* Migrate the page towards the node whose CPU is referencing it */
e42c8ff2 2433 if (pol->flags & MPOL_F_MORON) {
90572890 2434 polnid = thisnid;
5606e387 2435
10f39042 2436 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
de1c9ce6 2437 goto out;
e42c8ff2
MG
2438 }
2439
771fb4d8
LS
2440 if (curnid != polnid)
2441 ret = polnid;
2442out:
2443 mpol_cond_put(pol);
2444
2445 return ret;
2446}
2447
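/*
 * Illustrative sketch (not part of the original file): the NUMA hinting
 * fault path typically consumes the return value along these lines:
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid == NUMA_NO_NODE)
 *		... leave the page where it is ...
 *	else
 *		... try to migrate the page to target_nid ...
 */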
c11600e4
DR
2448/*
2449 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2450 * dropped after task->mempolicy is set to NULL so that any allocation done as
2451 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2452 * policy.
2453 */
2454void mpol_put_task_policy(struct task_struct *task)
2455{
2456 struct mempolicy *pol;
2457
2458 task_lock(task);
2459 pol = task->mempolicy;
2460 task->mempolicy = NULL;
2461 task_unlock(task);
2462 mpol_put(pol);
2463}
2464
1da177e4
LT
2465static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2466{
140d5a49 2467 pr_debug("deleting %lx-%lx\n", n->start, n->end);
1da177e4 2468 rb_erase(&n->nd, &sp->root);
63f74ca2 2469 sp_free(n);
1da177e4
LT
2470}
2471
42288fe3
MG
2472static void sp_node_init(struct sp_node *node, unsigned long start,
2473 unsigned long end, struct mempolicy *pol)
2474{
2475 node->start = start;
2476 node->end = end;
2477 node->policy = pol;
2478}
2479
dbcb0f19
AB
2480static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2481 struct mempolicy *pol)
1da177e4 2482{
869833f2
KM
2483 struct sp_node *n;
2484 struct mempolicy *newpol;
1da177e4 2485
869833f2 2486 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1da177e4
LT
2487 if (!n)
2488 return NULL;
869833f2
KM
2489
2490 newpol = mpol_dup(pol);
2491 if (IS_ERR(newpol)) {
2492 kmem_cache_free(sn_cache, n);
2493 return NULL;
2494 }
2495 newpol->flags |= MPOL_F_SHARED;
42288fe3 2496 sp_node_init(n, start, end, newpol);
869833f2 2497
1da177e4
LT
2498 return n;
2499}
2500
2501/* Replace a policy range. */
2502static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2503 unsigned long end, struct sp_node *new)
2504{
b22d127a 2505 struct sp_node *n;
42288fe3
MG
2506 struct sp_node *n_new = NULL;
2507 struct mempolicy *mpol_new = NULL;
b22d127a 2508 int ret = 0;
1da177e4 2509
42288fe3 2510restart:
4a8c7bb5 2511 write_lock(&sp->lock);
1da177e4
LT
2512 n = sp_lookup(sp, start, end);
2513 /* Take care of old policies in the same range. */
2514 while (n && n->start < end) {
2515 struct rb_node *next = rb_next(&n->nd);
2516 if (n->start >= start) {
2517 if (n->end <= end)
2518 sp_delete(sp, n);
2519 else
2520 n->start = end;
2521 } else {
2522 /* Old policy spanning whole new range. */
2523 if (n->end > end) {
42288fe3
MG
2524 if (!n_new)
2525 goto alloc_new;
2526
2527 *mpol_new = *n->policy;
2528 atomic_set(&mpol_new->refcnt, 1);
7880639c 2529 sp_node_init(n_new, end, n->end, mpol_new);
1da177e4 2530 n->end = start;
5ca39575 2531 sp_insert(sp, n_new);
42288fe3
MG
2532 n_new = NULL;
2533 mpol_new = NULL;
1da177e4
LT
2534 break;
2535 } else
2536 n->end = start;
2537 }
2538 if (!next)
2539 break;
2540 n = rb_entry(next, struct sp_node, nd);
2541 }
2542 if (new)
2543 sp_insert(sp, new);
4a8c7bb5 2544 write_unlock(&sp->lock);
42288fe3
MG
2545 ret = 0;
2546
2547err_out:
2548 if (mpol_new)
2549 mpol_put(mpol_new);
2550 if (n_new)
2551 kmem_cache_free(sn_cache, n_new);
2552
b22d127a 2553 return ret;
42288fe3
MG
2554
2555alloc_new:
4a8c7bb5 2556 write_unlock(&sp->lock);
42288fe3
MG
2557 ret = -ENOMEM;
2558 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2559 if (!n_new)
2560 goto err_out;
2561 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2562 if (!mpol_new)
2563 goto err_out;
9530da0c 2564 atomic_set(&mpol_new->refcnt, 1);
42288fe3 2565 goto restart;
1da177e4
LT
2566}
2567
71fe804b
LS
2568/**
2569 * mpol_shared_policy_init - initialize shared policy for inode
2570 * @sp: pointer to inode shared policy
2571 * @mpol: struct mempolicy to install
2572 *
2573 * Install non-NULL @mpol in inode's shared policy rb-tree.
2574 * On entry, the current task has a reference on a non-NULL @mpol.
2575 * This must be released on exit.
4bfc4495 2576 * This is called at get_inode() calls and we can use GFP_KERNEL.
71fe804b
LS
2577 */
2578void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2579{
58568d2a
MX
2580 int ret;
2581
71fe804b 2582 sp->root = RB_ROOT; /* empty tree == default mempolicy */
4a8c7bb5 2583 rwlock_init(&sp->lock);
71fe804b
LS
2584
2585 if (mpol) {
2586 struct vm_area_struct pvma;
2587 struct mempolicy *new;
4bfc4495 2588 NODEMASK_SCRATCH(scratch);
71fe804b 2589
4bfc4495 2590 if (!scratch)
5c0c1654 2591 goto put_mpol;
71fe804b
LS
2592 /* contextualize the tmpfs mount point mempolicy */
2593 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
15d77835 2594 if (IS_ERR(new))
0cae3457 2595 goto free_scratch; /* no valid nodemask intersection */
58568d2a
MX
2596
2597 task_lock(current);
4bfc4495 2598 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
58568d2a 2599 task_unlock(current);
15d77835 2600 if (ret)
5c0c1654 2601 goto put_new;
71fe804b
LS
2602
2603 /* Create pseudo-vma that contains just the policy */
2c4541e2 2604 vma_init(&pvma, NULL);
71fe804b
LS
2605 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2606 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
15d77835 2607
5c0c1654 2608put_new:
71fe804b 2609 mpol_put(new); /* drop initial ref */
0cae3457 2610free_scratch:
4bfc4495 2611 NODEMASK_SCRATCH_FREE(scratch);
5c0c1654
LS
2612put_mpol:
2613 mpol_put(mpol); /* drop our incoming ref on sb mpol */
7339ff83
RH
2614 }
2615}
2616
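/*
 * Illustrative sketch (not part of the original file, names hypothetical):
 * a tmpfs-style filesystem wires this up while initializing an inode,
 * passing in any mempolicy parsed from its mount options:
 *
 *	mpol_shared_policy_init(&info->policy, sb_mpol);
 *
 * sb_mpol carries a reference taken at mount time (e.g. via
 * mpol_parse_str() plus mpol_get()); mpol_shared_policy_init() drops that
 * reference before returning, as noted above.
 */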
1da177e4
LT
2617int mpol_set_shared_policy(struct shared_policy *info,
2618 struct vm_area_struct *vma, struct mempolicy *npol)
2619{
2620 int err;
2621 struct sp_node *new = NULL;
2622 unsigned long sz = vma_pages(vma);
2623
028fec41 2624 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1da177e4 2625 vma->vm_pgoff,
45c4745a 2626 sz, npol ? npol->mode : -1,
028fec41 2627 npol ? npol->flags : -1,
269fbe72 2628 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
1da177e4
LT
2629
2630 if (npol) {
2631 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2632 if (!new)
2633 return -ENOMEM;
2634 }
2635 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2636 if (err && new)
63f74ca2 2637 sp_free(new);
1da177e4
LT
2638 return err;
2639}
2640
2641/* Free a backing policy store on inode delete. */
2642void mpol_free_shared_policy(struct shared_policy *p)
2643{
2644 struct sp_node *n;
2645 struct rb_node *next;
2646
2647 if (!p->root.rb_node)
2648 return;
4a8c7bb5 2649 write_lock(&p->lock);
1da177e4
LT
2650 next = rb_first(&p->root);
2651 while (next) {
2652 n = rb_entry(next, struct sp_node, nd);
2653 next = rb_next(&n->nd);
63f74ca2 2654 sp_delete(p, n);
1da177e4 2655 }
4a8c7bb5 2656 write_unlock(&p->lock);
1da177e4
LT
2657}
2658
1a687c2e 2659#ifdef CONFIG_NUMA_BALANCING
c297663c 2660static int __initdata numabalancing_override;
1a687c2e
MG
2661
2662static void __init check_numabalancing_enable(void)
2663{
2664 bool numabalancing_default = false;
2665
2666 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2667 numabalancing_default = true;
2668
c297663c
MG
2669 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2670 if (numabalancing_override)
2671 set_numabalancing_state(numabalancing_override == 1);
2672
b0dc2b9b 2673 if (num_online_nodes() > 1 && !numabalancing_override) {
756a025f 2674 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
c297663c 2675 numabalancing_default ? "Enabling" : "Disabling");
1a687c2e
MG
2676 set_numabalancing_state(numabalancing_default);
2677 }
2678}
2679
2680static int __init setup_numabalancing(char *str)
2681{
2682 int ret = 0;
2683 if (!str)
2684 goto out;
1a687c2e
MG
2685
2686 if (!strcmp(str, "enable")) {
c297663c 2687 numabalancing_override = 1;
1a687c2e
MG
2688 ret = 1;
2689 } else if (!strcmp(str, "disable")) {
c297663c 2690 numabalancing_override = -1;
1a687c2e
MG
2691 ret = 1;
2692 }
2693out:
2694 if (!ret)
4a404bea 2695 pr_warn("Unable to parse numa_balancing=\n");
1a687c2e
MG
2696
2697 return ret;
2698}
2699__setup("numa_balancing=", setup_numabalancing);
2700#else
2701static inline void __init check_numabalancing_enable(void)
2702{
2703}
2704#endif /* CONFIG_NUMA_BALANCING */
2705
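/*
 * Illustrative usage (not part of the original file): automatic NUMA
 * balancing can be forced on or off at boot with "numa_balancing=enable"
 * or "numa_balancing=disable" on the kernel command line, and toggled at
 * runtime through the kernel.numa_balancing sysctl, e.g.:
 *
 *	sysctl kernel.numa_balancing=1
 */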
1da177e4
LT
2706/* assumes fs == KERNEL_DS */
2707void __init numa_policy_init(void)
2708{
b71636e2
PM
2709 nodemask_t interleave_nodes;
2710 unsigned long largest = 0;
2711 int nid, prefer = 0;
2712
1da177e4
LT
2713 policy_cache = kmem_cache_create("numa_policy",
2714 sizeof(struct mempolicy),
20c2df83 2715 0, SLAB_PANIC, NULL);
1da177e4
LT
2716
2717 sn_cache = kmem_cache_create("shared_policy_node",
2718 sizeof(struct sp_node),
20c2df83 2719 0, SLAB_PANIC, NULL);
1da177e4 2720
5606e387
MG
2721 for_each_node(nid) {
2722 preferred_node_policy[nid] = (struct mempolicy) {
2723 .refcnt = ATOMIC_INIT(1),
2724 .mode = MPOL_PREFERRED,
2725 .flags = MPOL_F_MOF | MPOL_F_MORON,
269fbe72 2726 .nodes = nodemask_of_node(nid),
5606e387
MG
2727 };
2728 }
2729
b71636e2
PM
2730 /*
2731 * Set interleaving policy for system init. Interleaving is only
2732 * enabled across suitably sized nodes (default is >= 16MB), or
2733 * fall back to the largest node if they're all smaller.
2734 */
2735 nodes_clear(interleave_nodes);
01f13bd6 2736 for_each_node_state(nid, N_MEMORY) {
b71636e2
PM
2737 unsigned long total_pages = node_present_pages(nid);
2738
2739 /* Preserve the largest node */
2740 if (largest < total_pages) {
2741 largest = total_pages;
2742 prefer = nid;
2743 }
2744
2745 /* Interleave this node? */
2746 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2747 node_set(nid, interleave_nodes);
2748 }
2749
2750 /* All too small, use the largest */
2751 if (unlikely(nodes_empty(interleave_nodes)))
2752 node_set(prefer, interleave_nodes);
1da177e4 2753
028fec41 2754 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
b1de0d13 2755 pr_err("%s: interleaving failed\n", __func__);
1a687c2e
MG
2756
2757 check_numabalancing_enable();
1da177e4
LT
2758}
2759
8bccd85f 2760/* Reset policy of current process to default */
1da177e4
LT
2761void numa_default_policy(void)
2762{
028fec41 2763 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1da177e4 2764}
68860ec1 2765
095f1fc4
LS
2766/*
2767 * Parse and format mempolicy from/to strings
2768 */
2769
345ace9c
LS
2770static const char * const policy_modes[] =
2771{
2772 [MPOL_DEFAULT] = "default",
2773 [MPOL_PREFERRED] = "prefer",
2774 [MPOL_BIND] = "bind",
2775 [MPOL_INTERLEAVE] = "interleave",
d3a71033 2776 [MPOL_LOCAL] = "local",
b27abacc 2777 [MPOL_PREFERRED_MANY] = "prefer (many)",
345ace9c 2778};
1a75a6c8 2779
095f1fc4
LS
2780
2781#ifdef CONFIG_TMPFS
2782/**
f2a07f40 2783 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
095f1fc4 2784 * @str: string containing mempolicy to parse
71fe804b 2785 * @mpol: pointer to struct mempolicy pointer, returned on success.
095f1fc4
LS
2786 *
2787 * Format of input:
2788 * <mode>[=<flags>][:<nodelist>]
2789 *
71fe804b 2790 * On success, returns 0, else 1
095f1fc4 2791 */
a7a88b23 2792int mpol_parse_str(char *str, struct mempolicy **mpol)
095f1fc4 2793{
71fe804b 2794 struct mempolicy *new = NULL;
f2a07f40 2795 unsigned short mode_flags;
71fe804b 2796 nodemask_t nodes;
095f1fc4
LS
2797 char *nodelist = strchr(str, ':');
2798 char *flags = strchr(str, '=');
dedf2c73 2799 int err = 1, mode;
095f1fc4 2800
c7a91bc7
DC
2801 if (flags)
2802 *flags++ = '\0'; /* terminate mode string */
2803
095f1fc4
LS
2804 if (nodelist) {
2805 /* NUL-terminate mode or flags string */
2806 *nodelist++ = '\0';
71fe804b 2807 if (nodelist_parse(nodelist, nodes))
095f1fc4 2808 goto out;
01f13bd6 2809 if (!nodes_subset(nodes, node_states[N_MEMORY]))
095f1fc4 2810 goto out;
71fe804b
LS
2811 } else
2812 nodes_clear(nodes);
2813
dedf2c73 2814 mode = match_string(policy_modes, MPOL_MAX, str);
2815 if (mode < 0)
095f1fc4
LS
2816 goto out;
2817
71fe804b 2818 switch (mode) {
095f1fc4 2819 case MPOL_PREFERRED:
71fe804b 2820 /*
aa9f7d51
RD
2821 * Insist on a nodelist of one node only, although later
2822 * we use first_node(nodes) to grab a single node, so here
2823 * nodelist (or nodes) cannot be empty.
71fe804b 2824 */
095f1fc4
LS
2825 if (nodelist) {
2826 char *rest = nodelist;
2827 while (isdigit(*rest))
2828 rest++;
926f2ae0
KM
2829 if (*rest)
2830 goto out;
aa9f7d51
RD
2831 if (nodes_empty(nodes))
2832 goto out;
095f1fc4
LS
2833 }
2834 break;
095f1fc4
LS
2835 case MPOL_INTERLEAVE:
2836 /*
2837 * Default to online nodes with memory if no nodelist
2838 */
2839 if (!nodelist)
01f13bd6 2840 nodes = node_states[N_MEMORY];
3f226aa1 2841 break;
71fe804b 2842 case MPOL_LOCAL:
3f226aa1 2843 /*
71fe804b 2844 * Don't allow a nodelist; mpol_new() checks flags
3f226aa1 2845 */
71fe804b 2846 if (nodelist)
3f226aa1 2847 goto out;
3f226aa1 2848 break;
413b43de
RT
2849 case MPOL_DEFAULT:
2850 /*
2851 * Insist on an empty nodelist
2852 */
2853 if (!nodelist)
2854 err = 0;
2855 goto out;
b27abacc 2856 case MPOL_PREFERRED_MANY:
d69b2e63
KM
2857 case MPOL_BIND:
2858 /*
2859 * Insist on a nodelist
2860 */
2861 if (!nodelist)
2862 goto out;
095f1fc4
LS
2863 }
2864
71fe804b 2865 mode_flags = 0;
095f1fc4
LS
2866 if (flags) {
2867 /*
2868 * Currently, we only support two mutually exclusive
2869 * mode flags.
2870 */
2871 if (!strcmp(flags, "static"))
71fe804b 2872 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4 2873 else if (!strcmp(flags, "relative"))
71fe804b 2874 mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4 2875 else
926f2ae0 2876 goto out;
095f1fc4 2877 }
71fe804b
LS
2878
2879 new = mpol_new(mode, mode_flags, &nodes);
2880 if (IS_ERR(new))
926f2ae0
KM
2881 goto out;
2882
f2a07f40
HD
2883 /*
2884 * Save nodes for mpol_to_str() to show the tmpfs mount options
2885 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2886 */
269fbe72
BW
2887 if (mode != MPOL_PREFERRED) {
2888 new->nodes = nodes;
2889 } else if (nodelist) {
2890 nodes_clear(new->nodes);
2891 node_set(first_node(nodes), new->nodes);
2892 } else {
7858d7bc 2893 new->mode = MPOL_LOCAL;
269fbe72 2894 }
f2a07f40
HD
2895
2896 /*
2897 * Save nodes for contextualization: this will be used to "clone"
2898 * the mempolicy in a specific context [cpuset] at a later time.
2899 */
2900 new->w.user_nodemask = nodes;
2901
926f2ae0 2902 err = 0;
71fe804b 2903
095f1fc4
LS
2904out:
2905 /* Restore string for error message */
2906 if (nodelist)
2907 *--nodelist = ':';
2908 if (flags)
2909 *--flags = '=';
71fe804b
LS
2910 if (!err)
2911 *mpol = new;
095f1fc4
LS
2912 return err;
2913}
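/*
 * Illustrative examples (not part of the original file) of strings accepted
 * above, in the form used by tmpfs "mpol=" mount options:
 *
 *	"interleave:0-3"	interleave across nodes 0-3
 *	"bind=static:1,3"	bind to nodes 1 and 3 with the static flag
 *	"prefer:2"		prefer node 2
 *	"local"			allocate on the local node
 */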
2914#endif /* CONFIG_TMPFS */
2915
71fe804b
LS
2916/**
2917 * mpol_to_str - format a mempolicy structure for printing
2918 * @buffer: to contain formatted mempolicy string
2919 * @maxlen: length of @buffer
2920 * @pol: pointer to mempolicy to be formatted
71fe804b 2921 *
948927ee
DR
2922 * Convert @pol into a string. If @buffer is too short, truncate the string.
2923 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2924 * longest flag, "relative", and to display at least a few node ids.
1a75a6c8 2925 */
948927ee 2926void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1a75a6c8
CL
2927{
2928 char *p = buffer;
948927ee
DR
2929 nodemask_t nodes = NODE_MASK_NONE;
2930 unsigned short mode = MPOL_DEFAULT;
2931 unsigned short flags = 0;
2291990a 2932
8790c71a 2933 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
bea904d5 2934 mode = pol->mode;
948927ee
DR
2935 flags = pol->flags;
2936 }
bea904d5 2937
1a75a6c8
CL
2938 switch (mode) {
2939 case MPOL_DEFAULT:
7858d7bc 2940 case MPOL_LOCAL:
1a75a6c8 2941 break;
1a75a6c8 2942 case MPOL_PREFERRED:
b27abacc 2943 case MPOL_PREFERRED_MANY:
1a75a6c8 2944 case MPOL_BIND:
1a75a6c8 2945 case MPOL_INTERLEAVE:
269fbe72 2946 nodes = pol->nodes;
1a75a6c8 2947 break;
1a75a6c8 2948 default:
948927ee
DR
2949 WARN_ON_ONCE(1);
2950 snprintf(p, maxlen, "unknown");
2951 return;
1a75a6c8
CL
2952 }
2953
b7a9f420 2954 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
1a75a6c8 2955
fc36b8d3 2956 if (flags & MPOL_MODE_FLAGS) {
948927ee 2957 p += snprintf(p, buffer + maxlen - p, "=");
f5b087b5 2958
2291990a
LS
2959 /*
2960 * Currently, the only defined flags are mutually exclusive
2961 */
f5b087b5 2962 if (flags & MPOL_F_STATIC_NODES)
2291990a
LS
2963 p += snprintf(p, buffer + maxlen - p, "static");
2964 else if (flags & MPOL_F_RELATIVE_NODES)
2965 p += snprintf(p, buffer + maxlen - p, "relative");
f5b087b5
DR
2966 }
2967
9e763e0f
TH
2968 if (!nodes_empty(nodes))
2969 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2970 nodemask_pr_args(&nodes));
1a75a6c8 2971}
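/*
 * Illustrative outputs (not part of the original file): the formatting above
 * produces strings such as "default", "local", "prefer:2",
 * "interleave=relative:0-3" or "bind=static:1,3", i.e. the same shape that
 * mpol_parse_str() accepts.
 */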
20b51af1
HY
2972
2973bool numa_demotion_enabled = false;
2974
2975#ifdef CONFIG_SYSFS
2976static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
2977 struct kobj_attribute *attr, char *buf)
2978{
2979 return sysfs_emit(buf, "%s\n",
2980 numa_demotion_enabled? "true" : "false");
2981}
2982
2983static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
2984 struct kobj_attribute *attr,
2985 const char *buf, size_t count)
2986{
2987 if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
2988 numa_demotion_enabled = true;
2989 else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
2990 numa_demotion_enabled = false;
2991 else
2992 return -EINVAL;
2993
2994 return count;
2995}
2996
2997static struct kobj_attribute numa_demotion_enabled_attr =
2998 __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
2999 numa_demotion_enabled_store);
3000
3001static struct attribute *numa_attrs[] = {
3002 &numa_demotion_enabled_attr.attr,
3003 NULL,
3004};
3005
3006static const struct attribute_group numa_attr_group = {
3007 .attrs = numa_attrs,
3008};
3009
3010static int __init numa_init_sysfs(void)
3011{
3012 int err;
3013 struct kobject *numa_kobj;
3014
3015 numa_kobj = kobject_create_and_add("numa", mm_kobj);
3016 if (!numa_kobj) {
3017 pr_err("failed to create numa kobject\n");
3018 return -ENOMEM;
3019 }
3020 err = sysfs_create_group(numa_kobj, &numa_attr_group);
3021 if (err) {
3022 pr_err("failed to register numa group\n");
3023 goto delete_obj;
3024 }
3025 return 0;
3026
3027delete_obj:
3028 kobject_put(numa_kobj);
3029 return err;
3030}
3031subsys_initcall(numa_init_sysfs);
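/*
 * Illustrative usage (not part of the original file): with CONFIG_SYSFS the
 * knob registered above appears as /sys/kernel/mm/numa/demotion_enabled and
 * accepts "true"/"1" or "false"/"0", e.g.:
 *
 *	echo true > /sys/kernel/mm/numa/demotion_enabled
 */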
3032#endif