// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a per-process
 *		counter is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case NUMA_NO_NODE here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non-default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
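
/*
 * Illustrative userspace usage (not part of this file; a minimal sketch
 * assuming a two-node system and the raw set_mempolicy()/mbind() syscall
 * interface, e.g. via libnuma's <numaif.h> wrappers). It shows how the
 * task-wide and per-VMA policies described above are typically installed:
 *
 *	unsigned long both = 0x3;			// nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &both, 3);	// task-wide policy
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long node1 = 0x2;			// node 1 only
 *	mbind(buf, len, MPOL_BIND, &node1, 3, MPOL_MF_MOVE);	// VMA policy
 *
 * Note that maxnode (3 here) must be large enough to cover the highest
 * node bit used; see get_nodes() below for the exact semantics.
 */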

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_map_to_online_node - Find closest online node
 * @node: Node id to start the search
 *
 * Lookup the next closest node by distance if @node is not online.
 */
int numa_map_to_online_node(int node)
{
	int min_dist = INT_MAX, dist, n, min_node;

	if (node == NUMA_NO_NODE || node_online(node))
		return node;

	min_node = node;
	for_each_online_node(n) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);
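
/*
 * Illustrative example (hypothetical topology): if node 2 is offline and
 * node_distance(2, 3) is the smallest distance from node 2 to any online
 * node, then numa_map_to_online_node(2) returns 3. Online nodes and
 * NUMA_NO_NODE are returned unchanged.
 */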

struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
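
/*
 * Worked example (illustrative): with MPOL_F_RELATIVE_NODES, a user mask
 * of {0,2} applied relative to an allowed set of {4,5,6} is first folded
 * modulo nodes_weight({4,5,6}) = 3, giving {0,2}, and then mapped onto the
 * allowed set position by position, giving {4,6}.
 */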
192
37012946
DR
193static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
194{
195 if (nodes_empty(*nodes))
196 return -EINVAL;
197 pol->v.nodes = *nodes;
198 return 0;
199}
200
201static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
202{
203 if (!nodes)
fc36b8d3 204 pol->flags |= MPOL_F_LOCAL; /* local allocation */
37012946
DR
205 else if (nodes_empty(*nodes))
206 return -EINVAL; /* no allowed nodes */
207 else
208 pol->v.preferred_node = first_node(*nodes);
209 return 0;
210}
211
212static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
213{
859f7ef1 214 if (nodes_empty(*nodes))
37012946
DR
215 return -EINVAL;
216 pol->v.nodes = *nodes;
217 return 0;
218}
219
58568d2a
MX
220/*
221 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
222 * any, for the new policy. mpol_new() has already validated the nodes
223 * parameter with respect to the policy mode and flags. But, we need to
224 * handle an empty nodemask with MPOL_PREFERRED here.
225 *
226 * Must be called holding task's alloc_lock to protect task's mems_allowed
c1e8d7c6 227 * and mempolicy. May also be called holding the mmap_lock for write.
58568d2a 228 */
4bfc4495
KH
229static int mpol_set_nodemask(struct mempolicy *pol,
230 const nodemask_t *nodes, struct nodemask_scratch *nsc)
58568d2a 231{
58568d2a
MX
232 int ret;
233
234 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
235 if (pol == NULL)
236 return 0;
01f13bd6 237 /* Check N_MEMORY */
4bfc4495 238 nodes_and(nsc->mask1,
01f13bd6 239 cpuset_current_mems_allowed, node_states[N_MEMORY]);
58568d2a
MX
240
241 VM_BUG_ON(!nodes);
242 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
243 nodes = NULL; /* explicit local allocation */
244 else {
245 if (pol->flags & MPOL_F_RELATIVE_NODES)
859f7ef1 246 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
58568d2a 247 else
4bfc4495
KH
248 nodes_and(nsc->mask2, *nodes, nsc->mask1);
249
58568d2a
MX
250 if (mpol_store_user_nodemask(pol))
251 pol->w.user_nodemask = *nodes;
252 else
253 pol->w.cpuset_mems_allowed =
254 cpuset_current_mems_allowed;
255 }
256
4bfc4495
KH
257 if (nodes)
258 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
259 else
260 ret = mpol_ops[pol->mode].create(pol, NULL);
58568d2a
MX
261 return ret;
262}
263
264/*
265 * This function just creates a new policy, does some check and simple
266 * initialization. You must invoke mpol_set_nodemask() to set nodes.
267 */
028fec41
DR
268static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
269 nodemask_t *nodes)
1da177e4
LT
270{
271 struct mempolicy *policy;
272
028fec41 273 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
00ef2d2f 274 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
140d5a49 275
3e1f0645
DR
276 if (mode == MPOL_DEFAULT) {
277 if (nodes && !nodes_empty(*nodes))
37012946 278 return ERR_PTR(-EINVAL);
d3a71033 279 return NULL;
37012946 280 }
3e1f0645
DR
281 VM_BUG_ON(!nodes);
282
283 /*
284 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
285 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
286 * All other modes require a valid pointer to a non-empty nodemask.
287 */
288 if (mode == MPOL_PREFERRED) {
289 if (nodes_empty(*nodes)) {
290 if (((flags & MPOL_F_STATIC_NODES) ||
291 (flags & MPOL_F_RELATIVE_NODES)))
292 return ERR_PTR(-EINVAL);
3e1f0645 293 }
479e2802 294 } else if (mode == MPOL_LOCAL) {
8d303e44
PK
295 if (!nodes_empty(*nodes) ||
296 (flags & MPOL_F_STATIC_NODES) ||
297 (flags & MPOL_F_RELATIVE_NODES))
479e2802
PZ
298 return ERR_PTR(-EINVAL);
299 mode = MPOL_PREFERRED;
3e1f0645
DR
300 } else if (nodes_empty(*nodes))
301 return ERR_PTR(-EINVAL);
1da177e4
LT
302 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
303 if (!policy)
304 return ERR_PTR(-ENOMEM);
305 atomic_set(&policy->refcnt, 1);
45c4745a 306 policy->mode = mode;
3e1f0645 307 policy->flags = flags;
37012946 308
1da177e4 309 return policy;
37012946
DR
310}
311
52cd3b07
LS
312/* Slow path of a mpol destructor. */
313void __mpol_put(struct mempolicy *p)
314{
315 if (!atomic_dec_and_test(&p->refcnt))
316 return;
52cd3b07
LS
317 kmem_cache_free(policy_cache, p);
318}
319
213980c0 320static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
37012946
DR
321{
322}
323
213980c0 324static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
37012946
DR
325{
326 nodemask_t tmp;
327
328 if (pol->flags & MPOL_F_STATIC_NODES)
329 nodes_and(tmp, pol->w.user_nodemask, *nodes);
330 else if (pol->flags & MPOL_F_RELATIVE_NODES)
331 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
332 else {
213980c0
VB
333 nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
334 *nodes);
29b190fa 335 pol->w.cpuset_mems_allowed = *nodes;
37012946 336 }
f5b087b5 337
708c1bbc
MX
338 if (nodes_empty(tmp))
339 tmp = *nodes;
340
213980c0 341 pol->v.nodes = tmp;
37012946
DR
342}
343
344static void mpol_rebind_preferred(struct mempolicy *pol,
213980c0 345 const nodemask_t *nodes)
37012946
DR
346{
347 nodemask_t tmp;
348
37012946
DR
349 if (pol->flags & MPOL_F_STATIC_NODES) {
350 int node = first_node(pol->w.user_nodemask);
351
fc36b8d3 352 if (node_isset(node, *nodes)) {
37012946 353 pol->v.preferred_node = node;
fc36b8d3
LS
354 pol->flags &= ~MPOL_F_LOCAL;
355 } else
356 pol->flags |= MPOL_F_LOCAL;
37012946
DR
357 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
358 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
359 pol->v.preferred_node = first_node(tmp);
fc36b8d3 360 } else if (!(pol->flags & MPOL_F_LOCAL)) {
37012946
DR
361 pol->v.preferred_node = node_remap(pol->v.preferred_node,
362 pol->w.cpuset_mems_allowed,
363 *nodes);
364 pol->w.cpuset_mems_allowed = *nodes;
365 }
1da177e4
LT
366}
367
708c1bbc
MX
368/*
369 * mpol_rebind_policy - Migrate a policy to a different set of nodes
370 *
c1e8d7c6 371 * Per-vma policies are protected by mmap_lock. Allocations using per-task
213980c0
VB
372 * policies are protected by task->mems_allowed_seq to prevent a premature
373 * OOM/allocation failure due to parallel nodemask modification.
708c1bbc 374 */
213980c0 375static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1d0d2680 376{
1d0d2680
DR
377 if (!pol)
378 return;
2e25644e 379 if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
1d0d2680
DR
380 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
381 return;
708c1bbc 382
213980c0 383 mpol_ops[pol->mode].rebind(pol, newmask);
1d0d2680
DR
384}
385
386/*
387 * Wrapper for mpol_rebind_policy() that just requires task
388 * pointer, and updates task mempolicy.
58568d2a
MX
389 *
390 * Called with task's alloc_lock held.
1d0d2680
DR
391 */
392
213980c0 393void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1d0d2680 394{
213980c0 395 mpol_rebind_policy(tsk->mempolicy, new);
1d0d2680
DR
396}
397
398/*
399 * Rebind each vma in mm to new nodemask.
400 *
c1e8d7c6 401 * Call holding a reference to mm. Takes mm->mmap_lock during call.
1d0d2680
DR
402 */
403
404void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
405{
406 struct vm_area_struct *vma;
407
d8ed45c5 408 mmap_write_lock(mm);
1d0d2680 409 for (vma = mm->mmap; vma; vma = vma->vm_next)
213980c0 410 mpol_rebind_policy(vma->vm_policy, new);
d8ed45c5 411 mmap_write_unlock(mm);
1d0d2680
DR
412}
413
37012946
DR
414static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
415 [MPOL_DEFAULT] = {
416 .rebind = mpol_rebind_default,
417 },
418 [MPOL_INTERLEAVE] = {
419 .create = mpol_new_interleave,
420 .rebind = mpol_rebind_nodemask,
421 },
422 [MPOL_PREFERRED] = {
423 .create = mpol_new_preferred,
424 .rebind = mpol_rebind_preferred,
425 },
426 [MPOL_BIND] = {
427 .create = mpol_new_bind,
428 .rebind = mpol_rebind_nodemask,
429 },
430};
431
a53190a4 432static int migrate_page_add(struct page *page, struct list_head *pagelist,
fc301289 433 unsigned long flags);
1a75a6c8 434
6f4576e3
NH
435struct queue_pages {
436 struct list_head *pagelist;
437 unsigned long flags;
438 nodemask_t *nmask;
f18da660
LX
439 unsigned long start;
440 unsigned long end;
441 struct vm_area_struct *first;
6f4576e3
NH
442};
443
88aaa2a1
NH
444/*
445 * Check if the page's nid is in qp->nmask.
446 *
447 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
448 * in the invert of qp->nmask.
449 */
450static inline bool queue_pages_required(struct page *page,
451 struct queue_pages *qp)
452{
453 int nid = page_to_nid(page);
454 unsigned long flags = qp->flags;
455
456 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
457}
458
a7f40cfe 459/*
d8835445
YS
460 * queue_pages_pmd() has four possible return values:
461 * 0 - pages are placed on the right node or queued successfully.
462 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
463 * specified.
464 * 2 - THP was split.
465 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
466 * existing page was already on a node that does not follow the
467 * policy.
a7f40cfe 468 */
c8633798
NH
469static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
470 unsigned long end, struct mm_walk *walk)
959a7e13 471 __releases(ptl)
c8633798
NH
472{
473 int ret = 0;
474 struct page *page;
475 struct queue_pages *qp = walk->private;
476 unsigned long flags;
477
478 if (unlikely(is_pmd_migration_entry(*pmd))) {
a7f40cfe 479 ret = -EIO;
c8633798
NH
480 goto unlock;
481 }
482 page = pmd_page(*pmd);
483 if (is_huge_zero_page(page)) {
484 spin_unlock(ptl);
485 __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
d8835445 486 ret = 2;
c8633798
NH
487 goto out;
488 }
d8835445 489 if (!queue_pages_required(page, qp))
c8633798 490 goto unlock;
c8633798 491
c8633798
NH
492 flags = qp->flags;
493 /* go to thp migration */
a7f40cfe 494 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
a53190a4
YS
495 if (!vma_migratable(walk->vma) ||
496 migrate_page_add(page, qp->pagelist, flags)) {
d8835445 497 ret = 1;
a7f40cfe
YS
498 goto unlock;
499 }
a7f40cfe
YS
500 } else
501 ret = -EIO;
c8633798
NH
502unlock:
503 spin_unlock(ptl);
504out:
505 return ret;
506}
507
98094945
NH
508/*
509 * Scan through pages checking if pages follow certain conditions,
510 * and move them to the pagelist if they do.
d8835445
YS
511 *
512 * queue_pages_pte_range() has three possible return values:
513 * 0 - pages are placed on the right node or queued successfully.
514 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
515 * specified.
516 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
517 * on a node that does not follow the policy.
98094945 518 */
6f4576e3
NH
519static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
520 unsigned long end, struct mm_walk *walk)
1da177e4 521{
6f4576e3
NH
522 struct vm_area_struct *vma = walk->vma;
523 struct page *page;
524 struct queue_pages *qp = walk->private;
525 unsigned long flags = qp->flags;
c8633798 526 int ret;
d8835445 527 bool has_unmovable = false;
3f088420 528 pte_t *pte, *mapped_pte;
705e87c0 529 spinlock_t *ptl;
941150a3 530
c8633798
NH
531 ptl = pmd_trans_huge_lock(pmd, vma);
532 if (ptl) {
533 ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
d8835445 534 if (ret != 2)
a7f40cfe 535 return ret;
248db92d 536 }
d8835445 537 /* THP was split, fall through to pte walk */
91612e0d 538
337d9abf
NH
539 if (pmd_trans_unstable(pmd))
540 return 0;
94723aaf 541
3f088420 542 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
6f4576e3 543 for (; addr != end; pte++, addr += PAGE_SIZE) {
91612e0d 544 if (!pte_present(*pte))
1da177e4 545 continue;
6aab341e
LT
546 page = vm_normal_page(vma, addr, *pte);
547 if (!page)
1da177e4 548 continue;
053837fc 549 /*
62b61f61
HD
550 * vm_normal_page() filters out zero pages, but there might
551 * still be PageReserved pages to skip, perhaps in a VDSO.
053837fc 552 */
b79bc0a0 553 if (PageReserved(page))
f4598c8b 554 continue;
88aaa2a1 555 if (!queue_pages_required(page, qp))
38e35860 556 continue;
a7f40cfe 557 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
d8835445
YS
558 /* MPOL_MF_STRICT must be specified if we get here */
559 if (!vma_migratable(vma)) {
560 has_unmovable = true;
a7f40cfe 561 break;
d8835445 562 }
a53190a4
YS
563
564 /*
565 * Do not abort immediately since there may be
566 * temporary off LRU pages in the range. Still
567 * need migrate other LRU pages.
568 */
569 if (migrate_page_add(page, qp->pagelist, flags))
570 has_unmovable = true;
a7f40cfe
YS
571 } else
572 break;
6f4576e3 573 }
3f088420 574 pte_unmap_unlock(mapped_pte, ptl);
6f4576e3 575 cond_resched();
d8835445
YS
576
577 if (has_unmovable)
578 return 1;
579
a7f40cfe 580 return addr != end ? -EIO : 0;
91612e0d
HD
581}
582
6f4576e3
NH
583static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
584 unsigned long addr, unsigned long end,
585 struct mm_walk *walk)
e2d8cf40 586{
dcf17635 587 int ret = 0;
e2d8cf40 588#ifdef CONFIG_HUGETLB_PAGE
6f4576e3 589 struct queue_pages *qp = walk->private;
dcf17635 590 unsigned long flags = (qp->flags & MPOL_MF_VALID);
e2d8cf40 591 struct page *page;
cb900f41 592 spinlock_t *ptl;
d4c54919 593 pte_t entry;
e2d8cf40 594
6f4576e3
NH
595 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
596 entry = huge_ptep_get(pte);
d4c54919
NH
597 if (!pte_present(entry))
598 goto unlock;
599 page = pte_page(entry);
88aaa2a1 600 if (!queue_pages_required(page, qp))
e2d8cf40 601 goto unlock;
dcf17635
LX
602
603 if (flags == MPOL_MF_STRICT) {
604 /*
605 * STRICT alone means only detecting misplaced page and no
606 * need to further check other vma.
607 */
608 ret = -EIO;
609 goto unlock;
610 }
611
612 if (!vma_migratable(walk->vma)) {
613 /*
614 * Must be STRICT with MOVE*, otherwise .test_walk() have
615 * stopped walking current vma.
616 * Detecting misplaced page but allow migrating pages which
617 * have been queued.
618 */
619 ret = 1;
620 goto unlock;
621 }
622
e2d8cf40
NH
623 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
624 if (flags & (MPOL_MF_MOVE_ALL) ||
dcf17635
LX
625 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
626 if (!isolate_huge_page(page, qp->pagelist) &&
627 (flags & MPOL_MF_STRICT))
628 /*
629 * Failed to isolate page but allow migrating pages
630 * which have been queued.
631 */
632 ret = 1;
633 }
e2d8cf40 634unlock:
cb900f41 635 spin_unlock(ptl);
e2d8cf40
NH
636#else
637 BUG();
638#endif
dcf17635 639 return ret;
1da177e4
LT
640}
641
5877231f 642#ifdef CONFIG_NUMA_BALANCING
b24f53a0 643/*
4b10e7d5
MG
644 * This is used to mark a range of virtual addresses to be inaccessible.
645 * These are later cleared by a NUMA hinting fault. Depending on these
646 * faults, pages may be migrated for better NUMA placement.
647 *
648 * This is assuming that NUMA faults are handled using PROT_NONE. If
649 * an architecture makes a different choice, it will need further
650 * changes to the core.
b24f53a0 651 */
4b10e7d5
MG
652unsigned long change_prot_numa(struct vm_area_struct *vma,
653 unsigned long addr, unsigned long end)
b24f53a0 654{
4b10e7d5 655 int nr_updated;
b24f53a0 656
58705444 657 nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
03c5a6e1
MG
658 if (nr_updated)
659 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
b24f53a0 660
4b10e7d5 661 return nr_updated;
b24f53a0
LS
662}
663#else
664static unsigned long change_prot_numa(struct vm_area_struct *vma,
665 unsigned long addr, unsigned long end)
666{
667 return 0;
668}
5877231f 669#endif /* CONFIG_NUMA_BALANCING */
b24f53a0 670
6f4576e3
NH
671static int queue_pages_test_walk(unsigned long start, unsigned long end,
672 struct mm_walk *walk)
673{
674 struct vm_area_struct *vma = walk->vma;
675 struct queue_pages *qp = walk->private;
676 unsigned long endvma = vma->vm_end;
677 unsigned long flags = qp->flags;
678
a18b3ac2 679 /* range check first */
d888fb2b 680 VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
f18da660
LX
681
682 if (!qp->first) {
683 qp->first = vma;
684 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
685 (qp->start < vma->vm_start))
686 /* hole at head side of range */
a18b3ac2
LX
687 return -EFAULT;
688 }
f18da660
LX
689 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
690 ((vma->vm_end < qp->end) &&
691 (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
692 /* hole at middle or tail of range */
693 return -EFAULT;
a18b3ac2 694
a7f40cfe
YS
695 /*
696 * Need check MPOL_MF_STRICT to return -EIO if possible
697 * regardless of vma_migratable
698 */
699 if (!vma_migratable(vma) &&
700 !(flags & MPOL_MF_STRICT))
48684a65
NH
701 return 1;
702
6f4576e3
NH
703 if (endvma > end)
704 endvma = end;
6f4576e3 705
6f4576e3
NH
706 if (flags & MPOL_MF_LAZY) {
707 /* Similar to task_numa_work, skip inaccessible VMAs */
3122e80e 708 if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
4355c018 709 !(vma->vm_flags & VM_MIXEDMAP))
6f4576e3
NH
710 change_prot_numa(vma, start, endvma);
711 return 1;
712 }
713
77bf45e7 714 /* queue pages from current vma */
a7f40cfe 715 if (flags & MPOL_MF_VALID)
6f4576e3
NH
716 return 0;
717 return 1;
718}
719
7b86ac33
CH
720static const struct mm_walk_ops queue_pages_walk_ops = {
721 .hugetlb_entry = queue_pages_hugetlb,
722 .pmd_entry = queue_pages_pte_range,
723 .test_walk = queue_pages_test_walk,
724};
725
dc9aa5b9 726/*
98094945
NH
727 * Walk through page tables and collect pages to be migrated.
728 *
729 * If pages found in a given range are on a set of nodes (determined by
730 * @nodes and @flags,) it's isolated and queued to the pagelist which is
d8835445
YS
731 * passed via @private.
732 *
733 * queue_pages_range() has three possible return values:
734 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
735 * specified.
736 * 0 - queue pages successfully or no misplaced page.
a85dfc30
YS
737 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
738 * memory range specified by nodemask and maxnode points outside
739 * your accessible address space (-EFAULT)
dc9aa5b9 740 */
d05f0cdc 741static int
98094945 742queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
6f4576e3
NH
743 nodemask_t *nodes, unsigned long flags,
744 struct list_head *pagelist)
1da177e4 745{
f18da660 746 int err;
6f4576e3
NH
747 struct queue_pages qp = {
748 .pagelist = pagelist,
749 .flags = flags,
750 .nmask = nodes,
f18da660
LX
751 .start = start,
752 .end = end,
753 .first = NULL,
6f4576e3 754 };
6f4576e3 755
f18da660
LX
756 err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
757
758 if (!qp.first)
759 /* whole range in hole */
760 err = -EFAULT;
761
762 return err;
1da177e4
LT
763}
764
869833f2
KM
765/*
766 * Apply policy to a single VMA
c1e8d7c6 767 * This must be called with the mmap_lock held for writing.
869833f2
KM
768 */
769static int vma_replace_policy(struct vm_area_struct *vma,
770 struct mempolicy *pol)
8d34694c 771{
869833f2
KM
772 int err;
773 struct mempolicy *old;
774 struct mempolicy *new;
8d34694c
KM
775
776 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
777 vma->vm_start, vma->vm_end, vma->vm_pgoff,
778 vma->vm_ops, vma->vm_file,
779 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
780
869833f2
KM
781 new = mpol_dup(pol);
782 if (IS_ERR(new))
783 return PTR_ERR(new);
784
785 if (vma->vm_ops && vma->vm_ops->set_policy) {
8d34694c 786 err = vma->vm_ops->set_policy(vma, new);
869833f2
KM
787 if (err)
788 goto err_out;
8d34694c 789 }
869833f2
KM
790
791 old = vma->vm_policy;
c1e8d7c6 792 vma->vm_policy = new; /* protected by mmap_lock */
869833f2
KM
793 mpol_put(old);
794
795 return 0;
796 err_out:
797 mpol_put(new);
8d34694c
KM
798 return err;
799}
800
1da177e4 801/* Step 2: apply policy to a range and do splits. */
9d8cebd4
KM
802static int mbind_range(struct mm_struct *mm, unsigned long start,
803 unsigned long end, struct mempolicy *new_pol)
1da177e4
LT
804{
805 struct vm_area_struct *next;
9d8cebd4
KM
806 struct vm_area_struct *prev;
807 struct vm_area_struct *vma;
808 int err = 0;
e26a5114 809 pgoff_t pgoff;
9d8cebd4
KM
810 unsigned long vmstart;
811 unsigned long vmend;
1da177e4 812
097d5910 813 vma = find_vma(mm, start);
f18da660 814 VM_BUG_ON(!vma);
9d8cebd4 815
097d5910 816 prev = vma->vm_prev;
e26a5114
KM
817 if (start > vma->vm_start)
818 prev = vma;
819
9d8cebd4 820 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
1da177e4 821 next = vma->vm_next;
9d8cebd4
KM
822 vmstart = max(start, vma->vm_start);
823 vmend = min(end, vma->vm_end);
824
e26a5114
KM
825 if (mpol_equal(vma_policy(vma), new_pol))
826 continue;
827
828 pgoff = vma->vm_pgoff +
829 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
9d8cebd4 830 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
19a809af
AA
831 vma->anon_vma, vma->vm_file, pgoff,
832 new_pol, vma->vm_userfaultfd_ctx);
9d8cebd4
KM
833 if (prev) {
834 vma = prev;
835 next = vma->vm_next;
3964acd0
ON
836 if (mpol_equal(vma_policy(vma), new_pol))
837 continue;
838 /* vma_merge() joined vma && vma->next, case 8 */
839 goto replace;
9d8cebd4
KM
840 }
841 if (vma->vm_start != vmstart) {
842 err = split_vma(vma->vm_mm, vma, vmstart, 1);
843 if (err)
844 goto out;
845 }
846 if (vma->vm_end != vmend) {
847 err = split_vma(vma->vm_mm, vma, vmend, 0);
848 if (err)
849 goto out;
850 }
3964acd0 851 replace:
869833f2 852 err = vma_replace_policy(vma, new_pol);
8d34694c
KM
853 if (err)
854 goto out;
1da177e4 855 }
9d8cebd4
KM
856
857 out:
1da177e4
LT
858 return err;
859}
860
1da177e4 861/* Set the process memory policy */
028fec41
DR
862static long do_set_mempolicy(unsigned short mode, unsigned short flags,
863 nodemask_t *nodes)
1da177e4 864{
58568d2a 865 struct mempolicy *new, *old;
4bfc4495 866 NODEMASK_SCRATCH(scratch);
58568d2a 867 int ret;
1da177e4 868
4bfc4495
KH
869 if (!scratch)
870 return -ENOMEM;
f4e53d91 871
4bfc4495
KH
872 new = mpol_new(mode, flags, nodes);
873 if (IS_ERR(new)) {
874 ret = PTR_ERR(new);
875 goto out;
876 }
2c7c3a7d 877
4bfc4495 878 ret = mpol_set_nodemask(new, nodes, scratch);
58568d2a 879 if (ret) {
58568d2a 880 mpol_put(new);
4bfc4495 881 goto out;
58568d2a 882 }
78b132e9 883 task_lock(current);
58568d2a 884 old = current->mempolicy;
1da177e4 885 current->mempolicy = new;
45816682
VB
886 if (new && new->mode == MPOL_INTERLEAVE)
887 current->il_prev = MAX_NUMNODES-1;
58568d2a 888 task_unlock(current);
58568d2a 889 mpol_put(old);
4bfc4495
KH
890 ret = 0;
891out:
892 NODEMASK_SCRATCH_FREE(scratch);
893 return ret;
1da177e4
LT
894}
895
bea904d5
LS
896/*
897 * Return nodemask for policy for get_mempolicy() query
58568d2a
MX
898 *
899 * Called with task's alloc_lock held
bea904d5
LS
900 */
901static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
1da177e4 902{
dfcd3c0d 903 nodes_clear(*nodes);
bea904d5
LS
904 if (p == &default_policy)
905 return;
906
45c4745a 907 switch (p->mode) {
19770b32 908 case MPOL_BIND:
1da177e4 909 case MPOL_INTERLEAVE:
dfcd3c0d 910 *nodes = p->v.nodes;
1da177e4
LT
911 break;
912 case MPOL_PREFERRED:
fc36b8d3 913 if (!(p->flags & MPOL_F_LOCAL))
dfcd3c0d 914 node_set(p->v.preferred_node, *nodes);
53f2556b 915 /* else return empty node mask for local allocation */
1da177e4
LT
916 break;
917 default:
918 BUG();
919 }
920}
921
3b9aadf7 922static int lookup_node(struct mm_struct *mm, unsigned long addr)
1da177e4 923{
ba841078 924 struct page *p = NULL;
1da177e4
LT
925 int err;
926
3b9aadf7
AA
927 int locked = 1;
928 err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
2d3a36a4 929 if (err > 0) {
1da177e4
LT
930 err = page_to_nid(p);
931 put_page(p);
932 }
3b9aadf7 933 if (locked)
d8ed45c5 934 mmap_read_unlock(mm);
1da177e4
LT
935 return err;
936}
937
1da177e4 938/* Retrieve NUMA policy */
dbcb0f19
AB
939static long do_get_mempolicy(int *policy, nodemask_t *nmask,
940 unsigned long addr, unsigned long flags)
1da177e4 941{
8bccd85f 942 int err;
1da177e4
LT
943 struct mm_struct *mm = current->mm;
944 struct vm_area_struct *vma = NULL;
3b9aadf7 945 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
1da177e4 946
754af6f5
LS
947 if (flags &
948 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
1da177e4 949 return -EINVAL;
754af6f5
LS
950
951 if (flags & MPOL_F_MEMS_ALLOWED) {
952 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
953 return -EINVAL;
954 *policy = 0; /* just so it's initialized */
58568d2a 955 task_lock(current);
754af6f5 956 *nmask = cpuset_current_mems_allowed;
58568d2a 957 task_unlock(current);
754af6f5
LS
958 return 0;
959 }
960
1da177e4 961 if (flags & MPOL_F_ADDR) {
bea904d5
LS
962 /*
963 * Do NOT fall back to task policy if the
964 * vma/shared policy at addr is NULL. We
965 * want to return MPOL_DEFAULT in this case.
966 */
d8ed45c5 967 mmap_read_lock(mm);
1da177e4
LT
968 vma = find_vma_intersection(mm, addr, addr+1);
969 if (!vma) {
d8ed45c5 970 mmap_read_unlock(mm);
1da177e4
LT
971 return -EFAULT;
972 }
973 if (vma->vm_ops && vma->vm_ops->get_policy)
974 pol = vma->vm_ops->get_policy(vma, addr);
975 else
976 pol = vma->vm_policy;
977 } else if (addr)
978 return -EINVAL;
979
980 if (!pol)
bea904d5 981 pol = &default_policy; /* indicates default behavior */
1da177e4
LT
982
983 if (flags & MPOL_F_NODE) {
984 if (flags & MPOL_F_ADDR) {
3b9aadf7
AA
985 /*
986 * Take a refcount on the mpol, lookup_node()
c1e8d7c6 987 * wil drop the mmap_lock, so after calling
3b9aadf7
AA
988 * lookup_node() only "pol" remains valid, "vma"
989 * is stale.
990 */
991 pol_refcount = pol;
992 vma = NULL;
993 mpol_get(pol);
994 err = lookup_node(mm, addr);
1da177e4
LT
995 if (err < 0)
996 goto out;
8bccd85f 997 *policy = err;
1da177e4 998 } else if (pol == current->mempolicy &&
45c4745a 999 pol->mode == MPOL_INTERLEAVE) {
45816682 1000 *policy = next_node_in(current->il_prev, pol->v.nodes);
1da177e4
LT
1001 } else {
1002 err = -EINVAL;
1003 goto out;
1004 }
bea904d5
LS
1005 } else {
1006 *policy = pol == &default_policy ? MPOL_DEFAULT :
1007 pol->mode;
d79df630
DR
1008 /*
1009 * Internal mempolicy flags must be masked off before exposing
1010 * the policy to userspace.
1011 */
1012 *policy |= (pol->flags & MPOL_MODE_FLAGS);
bea904d5 1013 }
1da177e4 1014
1da177e4 1015 err = 0;
58568d2a 1016 if (nmask) {
c6b6ef8b
LS
1017 if (mpol_store_user_nodemask(pol)) {
1018 *nmask = pol->w.user_nodemask;
1019 } else {
1020 task_lock(current);
1021 get_policy_nodemask(pol, nmask);
1022 task_unlock(current);
1023 }
58568d2a 1024 }
1da177e4
LT
1025
1026 out:
52cd3b07 1027 mpol_cond_put(pol);
1da177e4 1028 if (vma)
d8ed45c5 1029 mmap_read_unlock(mm);
3b9aadf7
AA
1030 if (pol_refcount)
1031 mpol_put(pol_refcount);
1da177e4
LT
1032 return err;
1033}
1034
b20a3503 1035#ifdef CONFIG_MIGRATION
6ce3c4c0 1036/*
c8633798 1037 * page migration, thp tail pages can be passed.
6ce3c4c0 1038 */
a53190a4 1039static int migrate_page_add(struct page *page, struct list_head *pagelist,
fc301289 1040 unsigned long flags)
6ce3c4c0 1041{
c8633798 1042 struct page *head = compound_head(page);
6ce3c4c0 1043 /*
fc301289 1044 * Avoid migrating a page that is shared with others.
6ce3c4c0 1045 */
c8633798
NH
1046 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1047 if (!isolate_lru_page(head)) {
1048 list_add_tail(&head->lru, pagelist);
1049 mod_node_page_state(page_pgdat(head),
9de4f22a 1050 NR_ISOLATED_ANON + page_is_file_lru(head),
6c357848 1051 thp_nr_pages(head));
a53190a4
YS
1052 } else if (flags & MPOL_MF_STRICT) {
1053 /*
1054 * Non-movable page may reach here. And, there may be
1055 * temporary off LRU pages or non-LRU movable pages.
1056 * Treat them as unmovable pages since they can't be
1057 * isolated, so they can't be moved at the moment. It
1058 * should return -EIO for this case too.
1059 */
1060 return -EIO;
62695a84
NP
1061 }
1062 }
a53190a4
YS
1063
1064 return 0;
7e2ab150 1065}
6ce3c4c0 1066
7e2ab150
CL
1067/*
1068 * Migrate pages from one node to a target node.
1069 * Returns error or the number of pages not migrated.
1070 */
dbcb0f19
AB
1071static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1072 int flags)
7e2ab150
CL
1073{
1074 nodemask_t nmask;
1075 LIST_HEAD(pagelist);
1076 int err = 0;
a0976311
JK
1077 struct migration_target_control mtc = {
1078 .nid = dest,
1079 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1080 };
7e2ab150
CL
1081
1082 nodes_clear(nmask);
1083 node_set(source, nmask);
6ce3c4c0 1084
08270807
MK
1085 /*
1086 * This does not "check" the range but isolates all pages that
1087 * need migration. Between passing in the full user address
1088 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1089 */
1090 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
98094945 1091 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
7e2ab150
CL
1092 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1093
cf608ac1 1094 if (!list_empty(&pagelist)) {
a0976311
JK
1095 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1096 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
cf608ac1 1097 if (err)
e2d8cf40 1098 putback_movable_pages(&pagelist);
cf608ac1 1099 }
95a402c3 1100
7e2ab150 1101 return err;
6ce3c4c0
CL
1102}
1103
39743889 1104/*
7e2ab150
CL
1105 * Move pages between the two nodesets so as to preserve the physical
1106 * layout as much as possible.
39743889
CL
1107 *
1108 * Returns the number of page that could not be moved.
1109 */
0ce72d4f
AM
1110int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1111 const nodemask_t *to, int flags)
39743889 1112{
7e2ab150 1113 int busy = 0;
f555befd 1114 int err = 0;
7e2ab150 1115 nodemask_t tmp;
39743889 1116
236c32eb 1117 migrate_prep();
0aedadf9 1118
d8ed45c5 1119 mmap_read_lock(mm);
39743889 1120
da0aa138
KM
1121 /*
1122 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1123 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1124 * bit in 'tmp', and return that <source, dest> pair for migration.
1125 * The pair of nodemasks 'to' and 'from' define the map.
1126 *
1127 * If no pair of bits is found that way, fallback to picking some
1128 * pair of 'source' and 'dest' bits that are not the same. If the
1129 * 'source' and 'dest' bits are the same, this represents a node
1130 * that will be migrating to itself, so no pages need move.
1131 *
1132 * If no bits are left in 'tmp', or if all remaining bits left
1133 * in 'tmp' correspond to the same bit in 'to', return false
1134 * (nothing left to migrate).
1135 *
1136 * This lets us pick a pair of nodes to migrate between, such that
1137 * if possible the dest node is not already occupied by some other
1138 * source node, minimizing the risk of overloading the memory on a
1139 * node that would happen if we migrated incoming memory to a node
1140 * before migrating outgoing memory source that same node.
1141 *
1142 * A single scan of tmp is sufficient. As we go, we remember the
1143 * most recent <s, d> pair that moved (s != d). If we find a pair
1144 * that not only moved, but what's better, moved to an empty slot
1145 * (d is not set in tmp), then we break out then, with that pair.
ae0e47f0 1146 * Otherwise when we finish scanning from_tmp, we at least have the
da0aa138
KM
1147 * most recent <s, d> pair that moved. If we get all the way through
1148 * the scan of tmp without finding any node that moved, much less
1149 * moved to an empty node, then there is nothing left worth migrating.
1150 */
d4984711 1151
0ce72d4f 1152 tmp = *from;
7e2ab150
CL
1153 while (!nodes_empty(tmp)) {
1154 int s,d;
b76ac7e7 1155 int source = NUMA_NO_NODE;
7e2ab150
CL
1156 int dest = 0;
1157
1158 for_each_node_mask(s, tmp) {
4a5b18cc
LW
1159
1160 /*
1161 * do_migrate_pages() tries to maintain the relative
1162 * node relationship of the pages established between
1163 * threads and memory areas.
1164 *
1165 * However if the number of source nodes is not equal to
1166 * the number of destination nodes we can not preserve
1167 * this node relative relationship. In that case, skip
1168 * copying memory from a node that is in the destination
1169 * mask.
1170 *
1171 * Example: [2,3,4] -> [3,4,5] moves everything.
1172 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1173 */
1174
0ce72d4f
AM
1175 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1176 (node_isset(s, *to)))
4a5b18cc
LW
1177 continue;
1178
0ce72d4f 1179 d = node_remap(s, *from, *to);
7e2ab150
CL
1180 if (s == d)
1181 continue;
1182
1183 source = s; /* Node moved. Memorize */
1184 dest = d;
1185
1186 /* dest not in remaining from nodes? */
1187 if (!node_isset(dest, tmp))
1188 break;
1189 }
b76ac7e7 1190 if (source == NUMA_NO_NODE)
7e2ab150
CL
1191 break;
1192
1193 node_clear(source, tmp);
1194 err = migrate_to_node(mm, source, dest, flags);
1195 if (err > 0)
1196 busy += err;
1197 if (err < 0)
1198 break;
39743889 1199 }
d8ed45c5 1200 mmap_read_unlock(mm);
7e2ab150
CL
1201 if (err < 0)
1202 return err;
1203 return busy;
b20a3503
CL
1204
1205}
1206
3ad33b24
LS
1207/*
1208 * Allocate a new page for page migration based on vma policy.
d05f0cdc 1209 * Start by assuming the page is mapped by the same vma as contains @start.
3ad33b24
LS
1210 * Search forward from there, if not. N.B., this assumes that the
1211 * list of pages handed to migrate_pages()--which is how we get here--
1212 * is in virtual address order.
1213 */
666feb21 1214static struct page *new_page(struct page *page, unsigned long start)
95a402c3 1215{
d05f0cdc 1216 struct vm_area_struct *vma;
3f649ab7 1217 unsigned long address;
95a402c3 1218
d05f0cdc 1219 vma = find_vma(current->mm, start);
3ad33b24
LS
1220 while (vma) {
1221 address = page_address_in_vma(page, vma);
1222 if (address != -EFAULT)
1223 break;
1224 vma = vma->vm_next;
1225 }
11c731e8
WL
1226
1227 if (PageHuge(page)) {
389c8178
MH
1228 return alloc_huge_page_vma(page_hstate(compound_head(page)),
1229 vma, address);
94723aaf 1230 } else if (PageTransHuge(page)) {
c8633798
NH
1231 struct page *thp;
1232
19deb769
DR
1233 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1234 HPAGE_PMD_ORDER);
c8633798
NH
1235 if (!thp)
1236 return NULL;
1237 prep_transhuge_page(thp);
1238 return thp;
11c731e8 1239 }
0bf598d8 1240 /*
11c731e8 1241 * if !vma, alloc_page_vma() will use task or system default policy
0bf598d8 1242 */
0f556856
MH
1243 return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1244 vma, address);
95a402c3 1245}
b20a3503
CL
1246#else
1247
a53190a4 1248static int migrate_page_add(struct page *page, struct list_head *pagelist,
b20a3503
CL
1249 unsigned long flags)
1250{
a53190a4 1251 return -EIO;
39743889
CL
1252}
1253
0ce72d4f
AM
1254int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1255 const nodemask_t *to, int flags)
b20a3503
CL
1256{
1257 return -ENOSYS;
1258}
95a402c3 1259
666feb21 1260static struct page *new_page(struct page *page, unsigned long start)
95a402c3
CL
1261{
1262 return NULL;
1263}
b20a3503
CL
1264#endif
1265
dbcb0f19 1266static long do_mbind(unsigned long start, unsigned long len,
028fec41
DR
1267 unsigned short mode, unsigned short mode_flags,
1268 nodemask_t *nmask, unsigned long flags)
6ce3c4c0 1269{
6ce3c4c0
CL
1270 struct mm_struct *mm = current->mm;
1271 struct mempolicy *new;
1272 unsigned long end;
1273 int err;
d8835445 1274 int ret;
6ce3c4c0
CL
1275 LIST_HEAD(pagelist);
1276
b24f53a0 1277 if (flags & ~(unsigned long)MPOL_MF_VALID)
6ce3c4c0 1278 return -EINVAL;
74c00241 1279 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
6ce3c4c0
CL
1280 return -EPERM;
1281
1282 if (start & ~PAGE_MASK)
1283 return -EINVAL;
1284
1285 if (mode == MPOL_DEFAULT)
1286 flags &= ~MPOL_MF_STRICT;
1287
1288 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1289 end = start + len;
1290
1291 if (end < start)
1292 return -EINVAL;
1293 if (end == start)
1294 return 0;
1295
028fec41 1296 new = mpol_new(mode, mode_flags, nmask);
6ce3c4c0
CL
1297 if (IS_ERR(new))
1298 return PTR_ERR(new);
1299
b24f53a0
LS
1300 if (flags & MPOL_MF_LAZY)
1301 new->flags |= MPOL_F_MOF;
1302
6ce3c4c0
CL
1303 /*
1304 * If we are using the default policy then operation
1305 * on discontinuous address spaces is okay after all
1306 */
1307 if (!new)
1308 flags |= MPOL_MF_DISCONTIG_OK;
1309
028fec41
DR
1310 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1311 start, start + len, mode, mode_flags,
00ef2d2f 1312 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
6ce3c4c0 1313
0aedadf9
CL
1314 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1315
236c32eb 1316 migrate_prep();
0aedadf9 1317 }
4bfc4495
KH
1318 {
1319 NODEMASK_SCRATCH(scratch);
1320 if (scratch) {
d8ed45c5 1321 mmap_write_lock(mm);
4bfc4495 1322 err = mpol_set_nodemask(new, nmask, scratch);
4bfc4495 1323 if (err)
d8ed45c5 1324 mmap_write_unlock(mm);
4bfc4495
KH
1325 } else
1326 err = -ENOMEM;
1327 NODEMASK_SCRATCH_FREE(scratch);
1328 }
b05ca738
KM
1329 if (err)
1330 goto mpol_out;
1331
d8835445 1332 ret = queue_pages_range(mm, start, end, nmask,
6ce3c4c0 1333 flags | MPOL_MF_INVERT, &pagelist);
d8835445
YS
1334
1335 if (ret < 0) {
a85dfc30 1336 err = ret;
d8835445
YS
1337 goto up_out;
1338 }
1339
1340 err = mbind_range(mm, start, end, new);
7e2ab150 1341
b24f53a0
LS
1342 if (!err) {
1343 int nr_failed = 0;
1344
cf608ac1 1345 if (!list_empty(&pagelist)) {
b24f53a0 1346 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
d05f0cdc
HD
1347 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1348 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
cf608ac1 1349 if (nr_failed)
74060e4d 1350 putback_movable_pages(&pagelist);
cf608ac1 1351 }
6ce3c4c0 1352
d8835445 1353 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
6ce3c4c0 1354 err = -EIO;
a85dfc30 1355 } else {
d8835445 1356up_out:
a85dfc30
YS
1357 if (!list_empty(&pagelist))
1358 putback_movable_pages(&pagelist);
1359 }
1360
d8ed45c5 1361 mmap_write_unlock(mm);
d8835445 1362mpol_out:
f0be3d32 1363 mpol_put(new);
6ce3c4c0
CL
1364 return err;
1365}
1366
8bccd85f
CL
1367/*
1368 * User space interface with variable sized bitmaps for nodelists.
1369 */
1370
1371/* Copy a node mask from user space. */
39743889 1372static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
8bccd85f
CL
1373 unsigned long maxnode)
1374{
1375 unsigned long k;
56521e7a 1376 unsigned long t;
8bccd85f
CL
1377 unsigned long nlongs;
1378 unsigned long endmask;
1379
1380 --maxnode;
1381 nodes_clear(*nodes);
1382 if (maxnode == 0 || !nmask)
1383 return 0;
a9c930ba 1384 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
636f13c1 1385 return -EINVAL;
8bccd85f
CL
1386
1387 nlongs = BITS_TO_LONGS(maxnode);
1388 if ((maxnode % BITS_PER_LONG) == 0)
1389 endmask = ~0UL;
1390 else
1391 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1392
56521e7a
YX
1393 /*
1394 * When the user specified more nodes than supported just check
1395 * if the non supported part is all zero.
1396 *
1397 * If maxnode have more longs than MAX_NUMNODES, check
1398 * the bits in that area first. And then go through to
1399 * check the rest bits which equal or bigger than MAX_NUMNODES.
1400 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1401 */
8bccd85f 1402 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
8bccd85f 1403 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
8bccd85f
CL
1404 if (get_user(t, nmask + k))
1405 return -EFAULT;
1406 if (k == nlongs - 1) {
1407 if (t & endmask)
1408 return -EINVAL;
1409 } else if (t)
1410 return -EINVAL;
1411 }
1412 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1413 endmask = ~0UL;
1414 }
1415
56521e7a
YX
1416 if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1417 unsigned long valid_mask = endmask;
1418
1419 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1420 if (get_user(t, nmask + nlongs - 1))
1421 return -EFAULT;
1422 if (t & valid_mask)
1423 return -EINVAL;
1424 }
1425
8bccd85f
CL
1426 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1427 return -EFAULT;
1428 nodes_addr(*nodes)[nlongs-1] &= endmask;
1429 return 0;
1430}
1431
1432/* Copy a kernel node mask to user space */
1433static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1434 nodemask_t *nodes)
1435{
1436 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
050c17f2 1437 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
8bccd85f
CL
1438
1439 if (copy > nbytes) {
1440 if (copy > PAGE_SIZE)
1441 return -EINVAL;
1442 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1443 return -EFAULT;
1444 copy = nbytes;
1445 }
1446 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1447}
1448
e7dc9ad6
DB
1449static long kernel_mbind(unsigned long start, unsigned long len,
1450 unsigned long mode, const unsigned long __user *nmask,
1451 unsigned long maxnode, unsigned int flags)
8bccd85f
CL
1452{
1453 nodemask_t nodes;
1454 int err;
028fec41 1455 unsigned short mode_flags;
8bccd85f 1456
057d3389 1457 start = untagged_addr(start);
028fec41
DR
1458 mode_flags = mode & MPOL_MODE_FLAGS;
1459 mode &= ~MPOL_MODE_FLAGS;
a3b51e01
DR
1460 if (mode >= MPOL_MAX)
1461 return -EINVAL;
4c50bc01
DR
1462 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1463 (mode_flags & MPOL_F_RELATIVE_NODES))
1464 return -EINVAL;
8bccd85f
CL
1465 err = get_nodes(&nodes, nmask, maxnode);
1466 if (err)
1467 return err;
028fec41 1468 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
8bccd85f
CL
1469}
1470
e7dc9ad6
DB
1471SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1472 unsigned long, mode, const unsigned long __user *, nmask,
1473 unsigned long, maxnode, unsigned int, flags)
1474{
1475 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1476}
1477
8bccd85f 1478/* Set the process memory policy */
af03c4ac
DB
1479static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1480 unsigned long maxnode)
8bccd85f
CL
1481{
1482 int err;
1483 nodemask_t nodes;
028fec41 1484 unsigned short flags;
8bccd85f 1485
028fec41
DR
1486 flags = mode & MPOL_MODE_FLAGS;
1487 mode &= ~MPOL_MODE_FLAGS;
1488 if ((unsigned int)mode >= MPOL_MAX)
8bccd85f 1489 return -EINVAL;
4c50bc01
DR
1490 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1491 return -EINVAL;
8bccd85f
CL
1492 err = get_nodes(&nodes, nmask, maxnode);
1493 if (err)
1494 return err;
028fec41 1495 return do_set_mempolicy(mode, flags, &nodes);
8bccd85f
CL
1496}
1497
af03c4ac
DB
1498SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1499 unsigned long, maxnode)
1500{
1501 return kernel_set_mempolicy(mode, nmask, maxnode);
1502}
1503
b6e9b0ba
DB
1504static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1505 const unsigned long __user *old_nodes,
1506 const unsigned long __user *new_nodes)
39743889 1507{
596d7cfa 1508 struct mm_struct *mm = NULL;
39743889 1509 struct task_struct *task;
39743889
CL
1510 nodemask_t task_nodes;
1511 int err;
596d7cfa
KM
1512 nodemask_t *old;
1513 nodemask_t *new;
1514 NODEMASK_SCRATCH(scratch);
1515
1516 if (!scratch)
1517 return -ENOMEM;
39743889 1518
596d7cfa
KM
1519 old = &scratch->mask1;
1520 new = &scratch->mask2;
1521
1522 err = get_nodes(old, old_nodes, maxnode);
39743889 1523 if (err)
596d7cfa 1524 goto out;
39743889 1525
596d7cfa 1526 err = get_nodes(new, new_nodes, maxnode);
39743889 1527 if (err)
596d7cfa 1528 goto out;
39743889
CL
1529
1530 /* Find the mm_struct */
55cfaa3c 1531 rcu_read_lock();
228ebcbe 1532 task = pid ? find_task_by_vpid(pid) : current;
39743889 1533 if (!task) {
55cfaa3c 1534 rcu_read_unlock();
596d7cfa
KM
1535 err = -ESRCH;
1536 goto out;
39743889 1537 }
3268c63e 1538 get_task_struct(task);
39743889 1539
596d7cfa 1540 err = -EINVAL;
39743889
CL
1541
1542 /*
31367466
OE
1543 * Check if this process has the right to modify the specified process.
1544 * Use the regular "ptrace_may_access()" checks.
39743889 1545 */
31367466 1546 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
c69e8d9c 1547 rcu_read_unlock();
39743889 1548 err = -EPERM;
3268c63e 1549 goto out_put;
39743889 1550 }
c69e8d9c 1551 rcu_read_unlock();
39743889
CL
1552
1553 task_nodes = cpuset_mems_allowed(task);
1554 /* Is the user allowed to access the target nodes? */
596d7cfa 1555 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
39743889 1556 err = -EPERM;
3268c63e 1557 goto out_put;
39743889
CL
1558 }
1559
0486a38b
YX
1560 task_nodes = cpuset_mems_allowed(current);
1561 nodes_and(*new, *new, task_nodes);
1562 if (nodes_empty(*new))
1563 goto out_put;
1564
86c3a764
DQ
1565 err = security_task_movememory(task);
1566 if (err)
3268c63e 1567 goto out_put;
86c3a764 1568
3268c63e
CL
1569 mm = get_task_mm(task);
1570 put_task_struct(task);
f2a9ef88
SL
1571
1572 if (!mm) {
3268c63e 1573 err = -EINVAL;
f2a9ef88
SL
1574 goto out;
1575 }
1576
1577 err = do_migrate_pages(mm, old, new,
1578 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
3268c63e
CL
1579
1580 mmput(mm);
1581out:
596d7cfa
KM
1582 NODEMASK_SCRATCH_FREE(scratch);
1583
39743889 1584 return err;
3268c63e
CL
1585
1586out_put:
1587 put_task_struct(task);
1588 goto out;
1589
39743889
CL
1590}
1591
b6e9b0ba
DB
1592SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1593 const unsigned long __user *, old_nodes,
1594 const unsigned long __user *, new_nodes)
1595{
1596 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1597}
1598
39743889 1599
8bccd85f 1600/* Retrieve NUMA policy */
af03c4ac
DB
1601static int kernel_get_mempolicy(int __user *policy,
1602 unsigned long __user *nmask,
1603 unsigned long maxnode,
1604 unsigned long addr,
1605 unsigned long flags)
8bccd85f 1606{
dbcb0f19 1607 int err;
3f649ab7 1608 int pval;
8bccd85f
CL
1609 nodemask_t nodes;
1610
050c17f2 1611 if (nmask != NULL && maxnode < nr_node_ids)
8bccd85f
CL
1612 return -EINVAL;
1613
4605f057
WH
1614 addr = untagged_addr(addr);
1615
8bccd85f
CL
1616 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1617
1618 if (err)
1619 return err;
1620
1621 if (policy && put_user(pval, policy))
1622 return -EFAULT;
1623
1624 if (nmask)
1625 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1626
1627 return err;
1628}
1629
af03c4ac
DB
1630SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1631 unsigned long __user *, nmask, unsigned long, maxnode,
1632 unsigned long, addr, unsigned long, flags)
1633{
1634 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1635}
1636
1da177e4
LT
1637#ifdef CONFIG_COMPAT
1638
c93e0f6c
HC
1639COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1640 compat_ulong_t __user *, nmask,
1641 compat_ulong_t, maxnode,
1642 compat_ulong_t, addr, compat_ulong_t, flags)
1da177e4
LT
1643{
1644 long err;
1645 unsigned long __user *nm = NULL;
1646 unsigned long nr_bits, alloc_size;
1647 DECLARE_BITMAP(bm, MAX_NUMNODES);
1648
050c17f2 1649 nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1da177e4
LT
1650 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1651
1652 if (nmask)
1653 nm = compat_alloc_user_space(alloc_size);
1654
af03c4ac 1655 err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1da177e4
LT
1656
1657 if (!err && nmask) {
2bbff6c7
KH
1658 unsigned long copy_size;
1659 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1660 err = copy_from_user(bm, nm, copy_size);
1da177e4
LT
1661 /* ensure entire bitmap is zeroed */
1662 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1663 err |= compat_put_bitmap(nmask, bm, nr_bits);
1664 }
1665
1666 return err;
1667}
1668
c93e0f6c
HC
1669COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1670 compat_ulong_t, maxnode)
1da177e4 1671{
1da177e4
LT
1672 unsigned long __user *nm = NULL;
1673 unsigned long nr_bits, alloc_size;
1674 DECLARE_BITMAP(bm, MAX_NUMNODES);
1675
1676 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1677 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1678
1679 if (nmask) {
cf01fb99
CS
1680 if (compat_get_bitmap(bm, nmask, nr_bits))
1681 return -EFAULT;
1da177e4 1682 nm = compat_alloc_user_space(alloc_size);
cf01fb99
CS
1683 if (copy_to_user(nm, bm, alloc_size))
1684 return -EFAULT;
1da177e4
LT
1685 }
1686
af03c4ac 1687 return kernel_set_mempolicy(mode, nm, nr_bits+1);
1da177e4
LT
1688}
1689
c93e0f6c
HC
1690COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1691 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1692 compat_ulong_t, maxnode, compat_ulong_t, flags)
1da177e4 1693{
1da177e4
LT
1694 unsigned long __user *nm = NULL;
1695 unsigned long nr_bits, alloc_size;
dfcd3c0d 1696 nodemask_t bm;
1da177e4
LT
1697
1698 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1699 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1700
1701 if (nmask) {
cf01fb99
CS
1702 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1703 return -EFAULT;
1da177e4 1704 nm = compat_alloc_user_space(alloc_size);
cf01fb99
CS
1705 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1706 return -EFAULT;
1da177e4
LT
1707 }
1708
e7dc9ad6 1709 return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1da177e4
LT
1710}
1711
b6e9b0ba
DB
1712COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1713 compat_ulong_t, maxnode,
1714 const compat_ulong_t __user *, old_nodes,
1715 const compat_ulong_t __user *, new_nodes)
1716{
1717 unsigned long __user *old = NULL;
1718 unsigned long __user *new = NULL;
1719 nodemask_t tmp_mask;
1720 unsigned long nr_bits;
1721 unsigned long size;
1722
1723 nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1724 size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1725 if (old_nodes) {
1726 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1727 return -EFAULT;
1728 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1729 if (new_nodes)
1730 new = old + size / sizeof(unsigned long);
1731 if (copy_to_user(old, nodes_addr(tmp_mask), size))
1732 return -EFAULT;
1733 }
1734 if (new_nodes) {
1735 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1736 return -EFAULT;
1737 if (new == NULL)
1738 new = compat_alloc_user_space(size);
1739 if (copy_to_user(new, nodes_addr(tmp_mask), size))
1740 return -EFAULT;
1741 }
1742 return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1743}
1744
1745#endif /* CONFIG_COMPAT */
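The compat wrappers above only repack the user nodemask before handing off to the native kernel_get_mempolicy()/kernel_set_mempolicy()/kernel_mbind() entry points. As a rough, hedged illustration of what reaches those entry points from userspace, here is a minimal sketch assuming libnuma's <numaif.h> syscall wrappers (a toolchain assumption; link with -lnuma); node numbers and mask sizes are illustrative only.

/* Hedged userspace sketch, not kernel code: exercises the policy syscalls. */
#include <numaif.h>		/* assumed: libnuma's syscall wrappers */
#include <stdio.h>

int main(void)
{
	unsigned long mask = 0x3;	/* illustrative: nodes 0 and 1 */
	int mode = -1;

	/* Interleave this task's future allocations across nodes 0-1. */
	if (set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask) + 1))
		perror("set_mempolicy");

	/* Query the task policy back (no address, so flags == 0). */
	if (get_mempolicy(&mode, NULL, 0, NULL, 0))
		perror("get_mempolicy");
	printf("current mode: %d\n", mode);
	return 0;
}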
1da177e4 1746
20ca87f2
LX
1747bool vma_migratable(struct vm_area_struct *vma)
1748{
1749 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1750 return false;
1751
1752 /*
1753 * DAX device mappings require predictable access latency, so avoid
1754 * incurring periodic faults.
1755 */
1756 if (vma_is_dax(vma))
1757 return false;
1758
1759 if (is_vm_hugetlb_page(vma) &&
1760 !hugepage_migration_supported(hstate_vma(vma)))
1761 return false;
1762
1763 /*
1764 * Migration allocates pages in the highest zone. If we cannot
1765 * do so then migration (at least from node to node) is not
1766 * possible.
1767 */
1768 if (vma->vm_file &&
1769 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1770 < policy_zone)
1771 return false;
1772 return true;
1773}
1774
74d2c3a0
ON
1775struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1776 unsigned long addr)
1da177e4 1777{
8d90274b 1778 struct mempolicy *pol = NULL;
1da177e4
LT
1779
1780 if (vma) {
480eccf9 1781 if (vma->vm_ops && vma->vm_ops->get_policy) {
8d90274b 1782 pol = vma->vm_ops->get_policy(vma, addr);
00442ad0 1783 } else if (vma->vm_policy) {
1da177e4 1784 pol = vma->vm_policy;
00442ad0
MG
1785
1786 /*
1787 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1788 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1789 * count on these policies which will be dropped by
1790 * mpol_cond_put() later
1791 */
1792 if (mpol_needs_cond_ref(pol))
1793 mpol_get(pol);
1794 }
1da177e4 1795 }
f15ca78e 1796
74d2c3a0
ON
1797 return pol;
1798}
1799
1800/*
dd6eecb9 1801 * get_vma_policy(@vma, @addr)
74d2c3a0
ON
1802 * @vma: virtual memory area whose policy is sought
1803 * @addr: address in @vma for shared policy lookup
1804 *
1805 * Returns effective policy for a VMA at specified address.
dd6eecb9 1806 * Falls back to current->mempolicy or system default policy, as necessary.
74d2c3a0
ON
1807 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1808 * count--added by the get_policy() vm_op, as appropriate--to protect against
1809 * freeing by another task. It is the caller's responsibility to free the
1810 * extra reference for shared policies.
1811 */
ac79f78d 1812static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
dd6eecb9 1813 unsigned long addr)
74d2c3a0
ON
1814{
1815 struct mempolicy *pol = __get_vma_policy(vma, addr);
1816
8d90274b 1817 if (!pol)
dd6eecb9 1818 pol = get_task_policy(current);
8d90274b 1819
1da177e4
LT
1820 return pol;
1821}
1822
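For readers following the reference-counting contract described above, this is the call pattern used inside this file (alloc_pages_vma() and mpol_misplaced() both follow it); the helper below is hypothetical and only illustrates pairing the lookup with mpol_cond_put().

/* Hedged sketch of the get_vma_policy()/mpol_cond_put() pairing. */
static int example_vma_policy_mode(struct vm_area_struct *vma,
				   unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);
	int mode = pol->mode;

	/* Drops the extra reference only for MPOL_F_SHARED policies. */
	mpol_cond_put(pol);
	return mode;
}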
6b6482bb 1823bool vma_policy_mof(struct vm_area_struct *vma)
fc314724 1824{
6b6482bb 1825 struct mempolicy *pol;
fc314724 1826
6b6482bb
ON
1827 if (vma->vm_ops && vma->vm_ops->get_policy) {
1828 bool ret = false;
fc314724 1829
6b6482bb
ON
1830 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1831 if (pol && (pol->flags & MPOL_F_MOF))
1832 ret = true;
1833 mpol_cond_put(pol);
8d90274b 1834
6b6482bb 1835 return ret;
fc314724
MG
1836 }
1837
6b6482bb 1838 pol = vma->vm_policy;
8d90274b 1839 if (!pol)
6b6482bb 1840 pol = get_task_policy(current);
8d90274b 1841
fc314724
MG
1842 return pol->flags & MPOL_F_MOF;
1843}
1844
d3eb1570
LJ
1845static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1846{
1847 enum zone_type dynamic_policy_zone = policy_zone;
1848
1849 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1850
1851 /*
1852 * if policy->v.nodes has movable memory only,
1853 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1854 *
 1855 * policy->v.nodes intersects with node_states[N_MEMORY],
 1856 * so if the following test fails, it implies that
 1857 * policy->v.nodes has movable memory only.
1858 */
1859 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1860 dynamic_policy_zone = ZONE_MOVABLE;
1861
1862 return zone >= dynamic_policy_zone;
1863}
1864
52cd3b07
LS
1865/*
1866 * Return a nodemask representing a mempolicy for filtering nodes for
1867 * page allocation
1868 */
8ca39e68 1869nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
19770b32
MG
1870{
1871 /* Lower zones don't get a nodemask applied for MPOL_BIND */
45c4745a 1872 if (unlikely(policy->mode == MPOL_BIND) &&
d3eb1570 1873 apply_policy_zone(policy, gfp_zone(gfp)) &&
19770b32
MG
1874 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1875 return &policy->v.nodes;
1876
1877 return NULL;
1878}
1879
04ec6264 1880/* Return the node id preferred by the given mempolicy, or the given id */
f8fd5253 1881static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1da177e4 1882{
6d840958
MH
1883 if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1884 nd = policy->v.preferred_node;
1885 else {
19770b32 1886 /*
6d840958
MH
1887 * __GFP_THISNODE shouldn't even be used with the bind policy
1888 * because we might easily break the expectation to stay on the
1889 * requested node and not break the policy.
19770b32 1890 */
6d840958 1891 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1da177e4 1892 }
6d840958 1893
04ec6264 1894 return nd;
1da177e4
LT
1895}
1896
1897/* Do dynamic interleaving for a process */
1898static unsigned interleave_nodes(struct mempolicy *policy)
1899{
45816682 1900 unsigned next;
1da177e4
LT
1901 struct task_struct *me = current;
1902
45816682 1903 next = next_node_in(me->il_prev, policy->v.nodes);
f5b087b5 1904 if (next < MAX_NUMNODES)
45816682
VB
1905 me->il_prev = next;
1906 return next;
1da177e4
LT
1907}
1908
dc85da15
CL
1909/*
 1910 * Depending on the memory policy, provide a node from which to allocate the
1911 * next slab entry.
1912 */
2a389610 1913unsigned int mempolicy_slab_node(void)
dc85da15 1914{
e7b691b0 1915 struct mempolicy *policy;
2a389610 1916 int node = numa_mem_id();
e7b691b0
AK
1917
1918 if (in_interrupt())
2a389610 1919 return node;
e7b691b0
AK
1920
1921 policy = current->mempolicy;
fc36b8d3 1922 if (!policy || policy->flags & MPOL_F_LOCAL)
2a389610 1923 return node;
bea904d5
LS
1924
1925 switch (policy->mode) {
1926 case MPOL_PREFERRED:
fc36b8d3
LS
1927 /*
1928 * handled MPOL_F_LOCAL above
1929 */
1930 return policy->v.preferred_node;
765c4507 1931
dc85da15
CL
1932 case MPOL_INTERLEAVE:
1933 return interleave_nodes(policy);
1934
dd1a239f 1935 case MPOL_BIND: {
c33d6c06
MG
1936 struct zoneref *z;
1937
dc85da15
CL
1938 /*
1939 * Follow bind policy behavior and start allocation at the
1940 * first node.
1941 */
19770b32 1942 struct zonelist *zonelist;
19770b32 1943 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
c9634cf0 1944 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
c33d6c06
MG
1945 z = first_zones_zonelist(zonelist, highest_zoneidx,
1946 &policy->v.nodes);
c1093b74 1947 return z->zone ? zone_to_nid(z->zone) : node;
dd1a239f 1948 }
dc85da15 1949
dc85da15 1950 default:
bea904d5 1951 BUG();
dc85da15
CL
1952 }
1953}
1954
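A hedged sketch of how an allocator-side caller might use the helper above; the real SLAB/SLUB paths are more involved, and the wrapper below is hypothetical.

/* Hedged sketch: allocate a slab object on the policy-preferred node. */
static void *example_policy_slab_alloc(struct kmem_cache *cachep, gfp_t gfp)
{
	/* Pick the node the current task's policy prefers for slab objects. */
	return kmem_cache_alloc_node(cachep, gfp, mempolicy_slab_node());
}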
fee83b3a
AM
1955/*
1956 * Do static interleaving for a VMA with known offset @n. Returns the n'th
1957 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1958 * number of present nodes.
1959 */
98c70baa 1960static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1da177e4 1961{
dfcd3c0d 1962 unsigned nnodes = nodes_weight(pol->v.nodes);
f5b087b5 1963 unsigned target;
fee83b3a
AM
1964 int i;
1965 int nid;
1da177e4 1966
f5b087b5
DR
1967 if (!nnodes)
1968 return numa_node_id();
fee83b3a
AM
1969 target = (unsigned int)n % nnodes;
1970 nid = first_node(pol->v.nodes);
1971 for (i = 0; i < target; i++)
dfcd3c0d 1972 nid = next_node(nid, pol->v.nodes);
1da177e4
LT
1973 return nid;
1974}
1975
5da7ca86
CL
1976/* Determine a node number for interleave */
1977static inline unsigned interleave_nid(struct mempolicy *pol,
1978 struct vm_area_struct *vma, unsigned long addr, int shift)
1979{
1980 if (vma) {
1981 unsigned long off;
1982
3b98b087
NA
1983 /*
1984 * for small pages, there is no difference between
1985 * shift and PAGE_SHIFT, so the bit-shift is safe.
1986 * for huge pages, since vm_pgoff is in units of small
1987 * pages, we need to shift off the always 0 bits to get
1988 * a useful offset.
1989 */
1990 BUG_ON(shift < PAGE_SHIFT);
1991 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
5da7ca86 1992 off += (addr - vma->vm_start) >> shift;
98c70baa 1993 return offset_il_node(pol, off);
5da7ca86
CL
1994 } else
1995 return interleave_nodes(pol);
1996}
1997
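The static interleave above reduces to "take the (n mod weight)-th set node of the policy nodemask". Below is a self-contained sketch of that arithmetic on a plain bitmask, using GCC builtins; it is illustrative only and not kernel code.

/* Hedged, userspace-style analogue of offset_il_node(). */
static int example_offset_node(unsigned long mask, unsigned long n)
{
	int weight = __builtin_popcountl(mask);	/* nodes_weight() analogue */
	unsigned int target, i;
	int nid = -1;

	if (!weight)
		return -1;			/* caller would fall back to the local node */
	target = n % weight;
	for (i = 0; i <= target; i++) {
		nid = __builtin_ctzl(mask);	/* next set bit = next node id */
		mask &= mask - 1;		/* clear it and keep walking */
	}
	return nid;
}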
00ac59ad 1998#ifdef CONFIG_HUGETLBFS
480eccf9 1999/*
04ec6264 2000 * huge_node(@vma, @addr, @gfp_flags, @mpol)
b46e14ac
FF
2001 * @vma: virtual memory area whose policy is sought
2002 * @addr: address in @vma for shared policy lookup and interleave policy
2003 * @gfp_flags: for requested zone
2004 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2005 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
480eccf9 2006 *
04ec6264 2007 * Returns a nid suitable for a huge page allocation and a pointer
52cd3b07
LS
2008 * to the struct mempolicy for conditional unref after allocation.
 2008 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
2010 * @nodemask for filtering the zonelist.
c0ff7453 2011 *
d26914d1 2012 * Must be protected by read_mems_allowed_begin()
480eccf9 2013 */
04ec6264
VB
2014int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2015 struct mempolicy **mpol, nodemask_t **nodemask)
5da7ca86 2016{
04ec6264 2017 int nid;
5da7ca86 2018
dd6eecb9 2019 *mpol = get_vma_policy(vma, addr);
19770b32 2020 *nodemask = NULL; /* assume !MPOL_BIND */
5da7ca86 2021
52cd3b07 2022 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
04ec6264
VB
2023 nid = interleave_nid(*mpol, vma, addr,
2024 huge_page_shift(hstate_vma(vma)));
52cd3b07 2025 } else {
04ec6264 2026 nid = policy_node(gfp_flags, *mpol, numa_node_id());
52cd3b07
LS
2027 if ((*mpol)->mode == MPOL_BIND)
2028 *nodemask = &(*mpol)->v.nodes;
480eccf9 2029 }
04ec6264 2030 return nid;
5da7ca86 2031}
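A hedged sketch of the caller side described above, loosely modelled on the hugetlb fault path; the dequeue helper named below is a hypothetical stand-in for whatever allocation step the real caller performs.

/* Hedged sketch: huge_node() picks nid/nodemask, caller allocates, then puts. */
static struct page *example_dequeue_huge(struct hstate *h,
					 struct vm_area_struct *vma,
					 unsigned long addr, gfp_t gfp_mask)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	unsigned int cpuset_mems_cookie;
	struct page *page;
	int nid;

	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
		/* hypothetical stand-in for the hugetlb dequeue/alloc step */
		page = alloc_huge_page_from(h, gfp_mask, nid, nodemask);
		mpol_cond_put(mpol);
	} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));

	return page;
}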
06808b08
LS
2032
2033/*
2034 * init_nodemask_of_mempolicy
2035 *
2036 * If the current task's mempolicy is "default" [NULL], return 'false'
2037 * to indicate default policy. Otherwise, extract the policy nodemask
2038 * for 'bind' or 'interleave' policy into the argument nodemask, or
2039 * initialize the argument nodemask to contain the single node for
2040 * 'preferred' or 'local' policy and return 'true' to indicate presence
2041 * of non-default mempolicy.
2042 *
2043 * We don't bother with reference counting the mempolicy [mpol_get/put]
 2044 * because the current task is examining its own mempolicy and a task's
2045 * mempolicy is only ever changed by the task itself.
2046 *
2047 * N.B., it is the caller's responsibility to free a returned nodemask.
2048 */
2049bool init_nodemask_of_mempolicy(nodemask_t *mask)
2050{
2051 struct mempolicy *mempolicy;
2052 int nid;
2053
2054 if (!(mask && current->mempolicy))
2055 return false;
2056
c0ff7453 2057 task_lock(current);
06808b08
LS
2058 mempolicy = current->mempolicy;
2059 switch (mempolicy->mode) {
2060 case MPOL_PREFERRED:
2061 if (mempolicy->flags & MPOL_F_LOCAL)
2062 nid = numa_node_id();
2063 else
2064 nid = mempolicy->v.preferred_node;
2065 init_nodemask_of_node(mask, nid);
2066 break;
2067
2068 case MPOL_BIND:
06808b08
LS
2069 case MPOL_INTERLEAVE:
2070 *mask = mempolicy->v.nodes;
2071 break;
2072
2073 default:
2074 BUG();
2075 }
c0ff7453 2076 task_unlock(current);
06808b08
LS
2077
2078 return true;
2079}
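A hedged usage sketch of the helper documented above (the walker below is hypothetical; the real user is the hugetlb nr_hugepages handling): restrict a node walk to the task's policy nodes, or fall back to every memory node for the default policy.

/* Hedged sketch: walk either the policy's nodes or all memory nodes. */
static void example_walk_policy_nodes(void)
{
	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL);
	int nid;

	if (nodes_allowed && init_nodemask_of_mempolicy(nodes_allowed)) {
		for_each_node_mask(nid, *nodes_allowed)
			pr_info("policy allows node %d\n", nid);
	} else {
		for_each_node_state(nid, N_MEMORY)
			pr_info("default policy, node %d\n", nid);
	}
	NODEMASK_FREE(nodes_allowed);
}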
00ac59ad 2080#endif
5da7ca86 2081
6f48d0eb
DR
2082/*
2083 * mempolicy_nodemask_intersects
2084 *
2085 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2086 * policy. Otherwise, check for intersection between mask and the policy
 2087 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
2088 * policy, always return true since it may allocate elsewhere on fallback.
2089 *
2090 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2091 */
2092bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2093 const nodemask_t *mask)
2094{
2095 struct mempolicy *mempolicy;
2096 bool ret = true;
2097
2098 if (!mask)
2099 return ret;
2100 task_lock(tsk);
2101 mempolicy = tsk->mempolicy;
2102 if (!mempolicy)
2103 goto out;
2104
2105 switch (mempolicy->mode) {
2106 case MPOL_PREFERRED:
2107 /*
2108 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
2109 * allocate from, they may fallback to other nodes when oom.
2110 * Thus, it's possible for tsk to have allocated memory from
2111 * nodes in mask.
2112 */
2113 break;
2114 case MPOL_BIND:
2115 case MPOL_INTERLEAVE:
2116 ret = nodes_intersects(mempolicy->v.nodes, *mask);
2117 break;
2118 default:
2119 BUG();
2120 }
2121out:
2122 task_unlock(tsk);
2123 return ret;
2124}
2125
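A hedged sketch of the kind of check the constrained-OOM path makes with the helper above (the function name is hypothetical): a task is only a useful victim if its policy could have placed memory on the starved nodes.

/* Hedged sketch: filter OOM candidates by mempolicy/node intersection. */
static bool example_oom_may_target(struct task_struct *tsk,
				   const nodemask_t *starved_nodes)
{
	/* True for the default policy, or a policy touching the starved nodes. */
	return mempolicy_nodemask_intersects(tsk, starved_nodes);
}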
1da177e4
LT
2126/* Allocate a page in interleaved policy.
2127 Own path because it needs to do special accounting. */
662f3a0b
AK
2128static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2129 unsigned nid)
1da177e4 2130{
1da177e4
LT
2131 struct page *page;
2132
04ec6264 2133 page = __alloc_pages(gfp, order, nid);
4518085e
KW
2134 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2135 if (!static_branch_likely(&vm_numa_stat_key))
2136 return page;
de55c8b2
AR
2137 if (page && page_to_nid(page) == nid) {
2138 preempt_disable();
2139 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2140 preempt_enable();
2141 }
1da177e4
LT
2142 return page;
2143}
2144
2145/**
0bbbc0b3 2146 * alloc_pages_vma - Allocate a page for a VMA.
1da177e4
LT
2147 *
2148 * @gfp:
2149 * %GFP_USER user allocation.
2150 * %GFP_KERNEL kernel allocations,
2151 * %GFP_HIGHMEM highmem/user allocations,
2152 * %GFP_FS allocation should not call back into a file system.
2153 * %GFP_ATOMIC don't sleep.
2154 *
0bbbc0b3 2155 * @order: Order of the GFP allocation.
1da177e4
LT
2156 * @vma: Pointer to VMA or NULL if not available.
2157 * @addr: Virtual Address of the allocation. Must be inside the VMA.
be97a41b 2158 * @node: Which node to prefer for allocation (modulo policy).
19deb769 2159 * @hugepage: for hugepages try only the preferred node if possible
1da177e4
LT
2160 *
2161 * This function allocates a page from the kernel page pool and applies
2162 * a NUMA policy associated with the VMA or the current process.
3e4e28c5 2163 * When VMA is not NULL caller must read-lock the mmap_lock of the
1da177e4 2164 * mm_struct of the VMA to prevent it from going away. Should be used for
be97a41b
VB
2165 * all allocations for pages that will be mapped into user space. Returns
2166 * NULL when no page can be allocated.
1da177e4
LT
2167 */
2168struct page *
0bbbc0b3 2169alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
19deb769 2170 unsigned long addr, int node, bool hugepage)
1da177e4 2171{
cc9a6c87 2172 struct mempolicy *pol;
c0ff7453 2173 struct page *page;
04ec6264 2174 int preferred_nid;
be97a41b 2175 nodemask_t *nmask;
cc9a6c87 2176
dd6eecb9 2177 pol = get_vma_policy(vma, addr);
1da177e4 2178
0867a57c
VB
2179 if (pol->mode == MPOL_INTERLEAVE) {
2180 unsigned nid;
2181
2182 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2183 mpol_cond_put(pol);
2184 page = alloc_page_interleave(gfp, order, nid);
2185 goto out;
19deb769
DR
2186 }
2187
2188 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2189 int hpage_node = node;
2190
2191 /*
2192 * For hugepage allocation and non-interleave policy which
2193 * allows the current node (or other explicitly preferred
2194 * node) we only try to allocate from the current/preferred
2195 * node and don't fall back to other nodes, as the cost of
2196 * remote accesses would likely offset THP benefits.
2197 *
2198 * If the policy is interleave, or does not allow the current
2199 * node in its nodemask, we allocate the standard way.
2200 */
2201 if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2202 hpage_node = pol->v.preferred_node;
2203
2204 nmask = policy_nodemask(gfp, pol);
2205 if (!nmask || node_isset(hpage_node, *nmask)) {
2206 mpol_cond_put(pol);
cc638f32
VB
2207 /*
2208 * First, try to allocate THP only on local node, but
2209 * don't reclaim unnecessarily, just compact.
2210 */
19deb769 2211 page = __alloc_pages_node(hpage_node,
cc638f32 2212 gfp | __GFP_THISNODE | __GFP_NORETRY, order);
76e654cc
DR
2213
2214 /*
2215 * If hugepage allocations are configured to always
2216 * synchronous compact or the vma has been madvised
2217 * to prefer hugepage backing, retry allowing remote
cc638f32 2218 * memory with both reclaim and compact as well.
76e654cc
DR
2219 */
2220 if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2221 page = __alloc_pages_node(hpage_node,
cc638f32 2222 gfp, order);
76e654cc 2223
19deb769
DR
2224 goto out;
2225 }
356ff8a9
DR
2226 }
2227
be97a41b 2228 nmask = policy_nodemask(gfp, pol);
04ec6264
VB
2229 preferred_nid = policy_node(gfp, pol, node);
2230 page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
d51e9894 2231 mpol_cond_put(pol);
be97a41b 2232out:
c0ff7453 2233 return page;
1da177e4 2234}
69262215 2235EXPORT_SYMBOL(alloc_pages_vma);
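A hedged caller sketch: this is essentially what the order-0 alloc_page_vma() wrapper in <linux/gfp.h> boils down to for an anonymous fault, with mmap_lock already held for read by the fault path (the helper name is hypothetical).

/* Hedged sketch: policy-aware page allocation for a faulting address. */
static struct page *example_anon_fault_page(struct vm_area_struct *vma,
					    unsigned long addr)
{
	return alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
			       numa_node_id(), false);
}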
1da177e4
LT
2236
2237/**
2238 * alloc_pages_current - Allocate pages.
2239 *
2240 * @gfp:
2241 * %GFP_USER user allocation,
2242 * %GFP_KERNEL kernel allocation,
2243 * %GFP_HIGHMEM highmem allocation,
2244 * %GFP_FS don't call back into a file system.
2245 * %GFP_ATOMIC don't sleep.
2246 * @order: Power of two of allocation size in pages. 0 is a single page.
2247 *
2248 * Allocate a page from the kernel page pool. When not in
 2249 * interrupt context, the current process' NUMA policy is applied.
2250 * Returns NULL when no page can be allocated.
1da177e4 2251 */
dd0fc66f 2252struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1da177e4 2253{
8d90274b 2254 struct mempolicy *pol = &default_policy;
c0ff7453 2255 struct page *page;
1da177e4 2256
8d90274b
ON
2257 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2258 pol = get_task_policy(current);
52cd3b07
LS
2259
2260 /*
2261 * No reference counting needed for current->mempolicy
2262 * nor system default_policy
2263 */
45c4745a 2264 if (pol->mode == MPOL_INTERLEAVE)
c0ff7453
MX
2265 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2266 else
2267 page = __alloc_pages_nodemask(gfp, order,
04ec6264 2268 policy_node(gfp, pol, numa_node_id()),
5c4b4be3 2269 policy_nodemask(gfp, pol));
cc9a6c87 2270
c0ff7453 2271 return page;
1da177e4
LT
2272}
2273EXPORT_SYMBOL(alloc_pages_current);
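With CONFIG_NUMA, the generic alloc_pages() helper in <linux/gfp.h> resolves to alloc_pages_current(), so an ordinary allocation like the hedged sketch below is already policy-aware (hypothetical helper; order and gfp flags chosen for illustration only).

/* Hedged sketch: a plain allocation placed according to current->mempolicy. */
static struct page *example_policy_aware_alloc(void)
{
	/* Two contiguous pages (order 1), subject to the task policy. */
	return alloc_pages(GFP_KERNEL, 1);
}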
2274
ef0855d3
ON
2275int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2276{
2277 struct mempolicy *pol = mpol_dup(vma_policy(src));
2278
2279 if (IS_ERR(pol))
2280 return PTR_ERR(pol);
2281 dst->vm_policy = pol;
2282 return 0;
2283}
2284
4225399a 2285/*
846a16bf 2286 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
4225399a
PJ
 2287 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2288 * with the mems_allowed returned by cpuset_mems_allowed(). This
2289 * keeps mempolicies cpuset relative after its cpuset moves. See
2290 * further kernel/cpuset.c update_nodemask().
708c1bbc
MX
2291 *
 2292 * current's mempolicy may be rebound by another task (the task that changes
 2293 * the cpuset's mems), so we needn't do rebind work for the current task.
4225399a 2294 */
4225399a 2295
846a16bf
LS
2296/* Slow path of a mempolicy duplicate */
2297struct mempolicy *__mpol_dup(struct mempolicy *old)
1da177e4
LT
2298{
2299 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2300
2301 if (!new)
2302 return ERR_PTR(-ENOMEM);
708c1bbc
MX
2303
2304 /* task's mempolicy is protected by alloc_lock */
2305 if (old == current->mempolicy) {
2306 task_lock(current);
2307 *new = *old;
2308 task_unlock(current);
2309 } else
2310 *new = *old;
2311
4225399a
PJ
2312 if (current_cpuset_is_being_rebound()) {
2313 nodemask_t mems = cpuset_mems_allowed(current);
213980c0 2314 mpol_rebind_policy(new, &mems);
4225399a 2315 }
1da177e4 2316 atomic_set(&new->refcnt, 1);
1da177e4
LT
2317 return new;
2318}
2319
2320/* Slow path of a mempolicy comparison */
fcfb4dcc 2321bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1da177e4
LT
2322{
2323 if (!a || !b)
fcfb4dcc 2324 return false;
45c4745a 2325 if (a->mode != b->mode)
fcfb4dcc 2326 return false;
19800502 2327 if (a->flags != b->flags)
fcfb4dcc 2328 return false;
19800502
BL
2329 if (mpol_store_user_nodemask(a))
2330 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
fcfb4dcc 2331 return false;
19800502 2332
45c4745a 2333 switch (a->mode) {
19770b32 2334 case MPOL_BIND:
1da177e4 2335 case MPOL_INTERLEAVE:
fcfb4dcc 2336 return !!nodes_equal(a->v.nodes, b->v.nodes);
1da177e4 2337 case MPOL_PREFERRED:
8970a63e
YX
2338 /* a's ->flags is the same as b's */
2339 if (a->flags & MPOL_F_LOCAL)
2340 return true;
75719661 2341 return a->v.preferred_node == b->v.preferred_node;
1da177e4
LT
2342 default:
2343 BUG();
fcfb4dcc 2344 return false;
1da177e4
LT
2345 }
2346}
2347
1da177e4
LT
2348/*
2349 * Shared memory backing store policy support.
2350 *
2351 * Remember policies even when nobody has shared memory mapped.
2352 * The policies are kept in Red-Black tree linked from the inode.
4a8c7bb5 2353 * They are protected by the sp->lock rwlock, which should be held
1da177e4
LT
2354 * for any accesses to the tree.
2355 */
2356
4a8c7bb5
NZ
2357/*
2358 * lookup first element intersecting start-end. Caller holds sp->lock for
2359 * reading or for writing
2360 */
1da177e4
LT
2361static struct sp_node *
2362sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2363{
2364 struct rb_node *n = sp->root.rb_node;
2365
2366 while (n) {
2367 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2368
2369 if (start >= p->end)
2370 n = n->rb_right;
2371 else if (end <= p->start)
2372 n = n->rb_left;
2373 else
2374 break;
2375 }
2376 if (!n)
2377 return NULL;
2378 for (;;) {
2379 struct sp_node *w = NULL;
2380 struct rb_node *prev = rb_prev(n);
2381 if (!prev)
2382 break;
2383 w = rb_entry(prev, struct sp_node, nd);
2384 if (w->end <= start)
2385 break;
2386 n = prev;
2387 }
2388 return rb_entry(n, struct sp_node, nd);
2389}
2390
4a8c7bb5
NZ
2391/*
2392 * Insert a new shared policy into the list. Caller holds sp->lock for
2393 * writing.
2394 */
1da177e4
LT
2395static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2396{
2397 struct rb_node **p = &sp->root.rb_node;
2398 struct rb_node *parent = NULL;
2399 struct sp_node *nd;
2400
2401 while (*p) {
2402 parent = *p;
2403 nd = rb_entry(parent, struct sp_node, nd);
2404 if (new->start < nd->start)
2405 p = &(*p)->rb_left;
2406 else if (new->end > nd->end)
2407 p = &(*p)->rb_right;
2408 else
2409 BUG();
2410 }
2411 rb_link_node(&new->nd, parent, p);
2412 rb_insert_color(&new->nd, &sp->root);
140d5a49 2413 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
45c4745a 2414 new->policy ? new->policy->mode : 0);
1da177e4
LT
2415}
2416
2417/* Find shared policy intersecting idx */
2418struct mempolicy *
2419mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2420{
2421 struct mempolicy *pol = NULL;
2422 struct sp_node *sn;
2423
2424 if (!sp->root.rb_node)
2425 return NULL;
4a8c7bb5 2426 read_lock(&sp->lock);
1da177e4
LT
2427 sn = sp_lookup(sp, idx, idx+1);
2428 if (sn) {
2429 mpol_get(sn->policy);
2430 pol = sn->policy;
2431 }
4a8c7bb5 2432 read_unlock(&sp->lock);
1da177e4
LT
2433 return pol;
2434}
2435
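A hedged sketch of the lookup contract above (hypothetical helper, shmem-style caller): the returned policy carries a reference that must be dropped once the caller is done with it.

/* Hedged sketch: look up and release a shared policy for a file index. */
static int example_shared_policy_mode(struct shared_policy *sp, pgoff_t index)
{
	struct mempolicy *pol = mpol_shared_policy_lookup(sp, index);
	int mode = pol ? pol->mode : MPOL_DEFAULT;

	/* NULL-safe; shared policies have MPOL_F_SHARED set, so this puts. */
	mpol_cond_put(pol);
	return mode;
}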
63f74ca2
KM
2436static void sp_free(struct sp_node *n)
2437{
2438 mpol_put(n->policy);
2439 kmem_cache_free(sn_cache, n);
2440}
2441
771fb4d8
LS
2442/**
2443 * mpol_misplaced - check whether current page node is valid in policy
2444 *
b46e14ac
FF
2445 * @page: page to be checked
2446 * @vma: vm area where page mapped
2447 * @addr: virtual address where page mapped
771fb4d8
LS
2448 *
 2449 * Look up the current policy node id for vma, addr and "compare" it to the page's
2450 * node id.
2451 *
2452 * Returns:
2453 * -1 - not misplaced, page is in the right node
2454 * node - node id where the page should be
2455 *
2456 * Policy determination "mimics" alloc_page_vma().
2457 * Called from fault path where we know the vma and faulting address.
2458 */
2459int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2460{
2461 struct mempolicy *pol;
c33d6c06 2462 struct zoneref *z;
771fb4d8
LS
2463 int curnid = page_to_nid(page);
2464 unsigned long pgoff;
90572890
PZ
2465 int thiscpu = raw_smp_processor_id();
2466 int thisnid = cpu_to_node(thiscpu);
98fa15f3 2467 int polnid = NUMA_NO_NODE;
771fb4d8
LS
2468 int ret = -1;
2469
dd6eecb9 2470 pol = get_vma_policy(vma, addr);
771fb4d8
LS
2471 if (!(pol->flags & MPOL_F_MOF))
2472 goto out;
2473
2474 switch (pol->mode) {
2475 case MPOL_INTERLEAVE:
771fb4d8
LS
2476 pgoff = vma->vm_pgoff;
2477 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
98c70baa 2478 polnid = offset_il_node(pol, pgoff);
771fb4d8
LS
2479 break;
2480
2481 case MPOL_PREFERRED:
2482 if (pol->flags & MPOL_F_LOCAL)
2483 polnid = numa_node_id();
2484 else
2485 polnid = pol->v.preferred_node;
2486 break;
2487
2488 case MPOL_BIND:
c33d6c06 2489
771fb4d8
LS
2490 /*
 2491 * MPOL_BIND allows binding to multiple nodes.
 2492 * Use the current page's node if it is in the policy nodemask,
 2493 * else select the nearest allowed node, if any.
 2494 * If there are no allowed nodes, use the current node [!misplaced].
2495 */
2496 if (node_isset(curnid, pol->v.nodes))
2497 goto out;
c33d6c06 2498 z = first_zones_zonelist(
771fb4d8
LS
2499 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2500 gfp_zone(GFP_HIGHUSER),
c33d6c06 2501 &pol->v.nodes);
c1093b74 2502 polnid = zone_to_nid(z->zone);
771fb4d8
LS
2503 break;
2504
2505 default:
2506 BUG();
2507 }
5606e387
MG
2508
2509 /* Migrate the page towards the node whose CPU is referencing it */
e42c8ff2 2510 if (pol->flags & MPOL_F_MORON) {
90572890 2511 polnid = thisnid;
5606e387 2512
10f39042 2513 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
de1c9ce6 2514 goto out;
e42c8ff2
MG
2515 }
2516
771fb4d8
LS
2517 if (curnid != polnid)
2518 ret = polnid;
2519out:
2520 mpol_cond_put(pol);
2521
2522 return ret;
2523}
2524
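A hedged sketch of the NUMA hinting fault usage described above (hypothetical helper, loosely modelled on the fault path): any return value other than -1 names the node the page should move to.

/* Hedged sketch: act on mpol_misplaced()'s verdict during a hinting fault. */
static void example_numa_hint_fault(struct page *page,
				    struct vm_area_struct *vma,
				    unsigned long addr)
{
	int target_nid = mpol_misplaced(page, vma, addr);

	if (target_nid != -1)	/* -1 == NUMA_NO_NODE: not misplaced */
		pr_debug("page %lx should migrate to node %d\n",
			 page_to_pfn(page), target_nid);
}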
c11600e4
DR
2525/*
2526 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2527 * dropped after task->mempolicy is set to NULL so that any allocation done as
2528 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2529 * policy.
2530 */
2531void mpol_put_task_policy(struct task_struct *task)
2532{
2533 struct mempolicy *pol;
2534
2535 task_lock(task);
2536 pol = task->mempolicy;
2537 task->mempolicy = NULL;
2538 task_unlock(task);
2539 mpol_put(pol);
2540}
2541
1da177e4
LT
2542static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2543{
140d5a49 2544 pr_debug("deleting %lx-%lx\n", n->start, n->end);
1da177e4 2545 rb_erase(&n->nd, &sp->root);
63f74ca2 2546 sp_free(n);
1da177e4
LT
2547}
2548
42288fe3
MG
2549static void sp_node_init(struct sp_node *node, unsigned long start,
2550 unsigned long end, struct mempolicy *pol)
2551{
2552 node->start = start;
2553 node->end = end;
2554 node->policy = pol;
2555}
2556
dbcb0f19
AB
2557static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2558 struct mempolicy *pol)
1da177e4 2559{
869833f2
KM
2560 struct sp_node *n;
2561 struct mempolicy *newpol;
1da177e4 2562
869833f2 2563 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1da177e4
LT
2564 if (!n)
2565 return NULL;
869833f2
KM
2566
2567 newpol = mpol_dup(pol);
2568 if (IS_ERR(newpol)) {
2569 kmem_cache_free(sn_cache, n);
2570 return NULL;
2571 }
2572 newpol->flags |= MPOL_F_SHARED;
42288fe3 2573 sp_node_init(n, start, end, newpol);
869833f2 2574
1da177e4
LT
2575 return n;
2576}
2577
2578/* Replace a policy range. */
2579static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2580 unsigned long end, struct sp_node *new)
2581{
b22d127a 2582 struct sp_node *n;
42288fe3
MG
2583 struct sp_node *n_new = NULL;
2584 struct mempolicy *mpol_new = NULL;
b22d127a 2585 int ret = 0;
1da177e4 2586
42288fe3 2587restart:
4a8c7bb5 2588 write_lock(&sp->lock);
1da177e4
LT
2589 n = sp_lookup(sp, start, end);
2590 /* Take care of old policies in the same range. */
2591 while (n && n->start < end) {
2592 struct rb_node *next = rb_next(&n->nd);
2593 if (n->start >= start) {
2594 if (n->end <= end)
2595 sp_delete(sp, n);
2596 else
2597 n->start = end;
2598 } else {
2599 /* Old policy spanning whole new range. */
2600 if (n->end > end) {
42288fe3
MG
2601 if (!n_new)
2602 goto alloc_new;
2603
2604 *mpol_new = *n->policy;
2605 atomic_set(&mpol_new->refcnt, 1);
7880639c 2606 sp_node_init(n_new, end, n->end, mpol_new);
1da177e4 2607 n->end = start;
5ca39575 2608 sp_insert(sp, n_new);
42288fe3
MG
2609 n_new = NULL;
2610 mpol_new = NULL;
1da177e4
LT
2611 break;
2612 } else
2613 n->end = start;
2614 }
2615 if (!next)
2616 break;
2617 n = rb_entry(next, struct sp_node, nd);
2618 }
2619 if (new)
2620 sp_insert(sp, new);
4a8c7bb5 2621 write_unlock(&sp->lock);
42288fe3
MG
2622 ret = 0;
2623
2624err_out:
2625 if (mpol_new)
2626 mpol_put(mpol_new);
2627 if (n_new)
2628 kmem_cache_free(sn_cache, n_new);
2629
b22d127a 2630 return ret;
42288fe3
MG
2631
2632alloc_new:
4a8c7bb5 2633 write_unlock(&sp->lock);
42288fe3
MG
2634 ret = -ENOMEM;
2635 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2636 if (!n_new)
2637 goto err_out;
2638 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2639 if (!mpol_new)
2640 goto err_out;
2641 goto restart;
1da177e4
LT
2642}
2643
71fe804b
LS
2644/**
2645 * mpol_shared_policy_init - initialize shared policy for inode
2646 * @sp: pointer to inode shared policy
2647 * @mpol: struct mempolicy to install
2648 *
2649 * Install non-NULL @mpol in inode's shared policy rb-tree.
2650 * On entry, the current task has a reference on a non-NULL @mpol.
2651 * This must be released on exit.
4bfc4495 2652 * This is called at get_inode() time, so we can use GFP_KERNEL.
71fe804b
LS
2653 */
2654void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2655{
58568d2a
MX
2656 int ret;
2657
71fe804b 2658 sp->root = RB_ROOT; /* empty tree == default mempolicy */
4a8c7bb5 2659 rwlock_init(&sp->lock);
71fe804b
LS
2660
2661 if (mpol) {
2662 struct vm_area_struct pvma;
2663 struct mempolicy *new;
4bfc4495 2664 NODEMASK_SCRATCH(scratch);
71fe804b 2665
4bfc4495 2666 if (!scratch)
5c0c1654 2667 goto put_mpol;
71fe804b
LS
2668 /* contextualize the tmpfs mount point mempolicy */
2669 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
15d77835 2670 if (IS_ERR(new))
0cae3457 2671 goto free_scratch; /* no valid nodemask intersection */
58568d2a
MX
2672
2673 task_lock(current);
4bfc4495 2674 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
58568d2a 2675 task_unlock(current);
15d77835 2676 if (ret)
5c0c1654 2677 goto put_new;
71fe804b
LS
2678
2679 /* Create pseudo-vma that contains just the policy */
2c4541e2 2680 vma_init(&pvma, NULL);
71fe804b
LS
2681 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2682 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
15d77835 2683
5c0c1654 2684put_new:
71fe804b 2685 mpol_put(new); /* drop initial ref */
0cae3457 2686free_scratch:
4bfc4495 2687 NODEMASK_SCRATCH_FREE(scratch);
5c0c1654
LS
2688put_mpol:
2689 mpol_put(mpol); /* drop our incoming ref on sb mpol */
7339ff83
RH
2690 }
2691}
2692
1da177e4
LT
2693int mpol_set_shared_policy(struct shared_policy *info,
2694 struct vm_area_struct *vma, struct mempolicy *npol)
2695{
2696 int err;
2697 struct sp_node *new = NULL;
2698 unsigned long sz = vma_pages(vma);
2699
028fec41 2700 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1da177e4 2701 vma->vm_pgoff,
45c4745a 2702 sz, npol ? npol->mode : -1,
028fec41 2703 npol ? npol->flags : -1,
00ef2d2f 2704 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
1da177e4
LT
2705
2706 if (npol) {
2707 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2708 if (!new)
2709 return -ENOMEM;
2710 }
2711 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2712 if (err && new)
63f74ca2 2713 sp_free(new);
1da177e4
LT
2714 return err;
2715}
2716
2717/* Free a backing policy store on inode delete. */
2718void mpol_free_shared_policy(struct shared_policy *p)
2719{
2720 struct sp_node *n;
2721 struct rb_node *next;
2722
2723 if (!p->root.rb_node)
2724 return;
4a8c7bb5 2725 write_lock(&p->lock);
1da177e4
LT
2726 next = rb_first(&p->root);
2727 while (next) {
2728 n = rb_entry(next, struct sp_node, nd);
2729 next = rb_next(&n->nd);
63f74ca2 2730 sp_delete(p, n);
1da177e4 2731 }
4a8c7bb5 2732 write_unlock(&p->lock);
1da177e4
LT
2733}
2734
1a687c2e 2735#ifdef CONFIG_NUMA_BALANCING
c297663c 2736static int __initdata numabalancing_override;
1a687c2e
MG
2737
2738static void __init check_numabalancing_enable(void)
2739{
2740 bool numabalancing_default = false;
2741
2742 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2743 numabalancing_default = true;
2744
c297663c
MG
2745 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2746 if (numabalancing_override)
2747 set_numabalancing_state(numabalancing_override == 1);
2748
b0dc2b9b 2749 if (num_online_nodes() > 1 && !numabalancing_override) {
756a025f 2750 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
c297663c 2751 numabalancing_default ? "Enabling" : "Disabling");
1a687c2e
MG
2752 set_numabalancing_state(numabalancing_default);
2753 }
2754}
2755
2756static int __init setup_numabalancing(char *str)
2757{
2758 int ret = 0;
2759 if (!str)
2760 goto out;
1a687c2e
MG
2761
2762 if (!strcmp(str, "enable")) {
c297663c 2763 numabalancing_override = 1;
1a687c2e
MG
2764 ret = 1;
2765 } else if (!strcmp(str, "disable")) {
c297663c 2766 numabalancing_override = -1;
1a687c2e
MG
2767 ret = 1;
2768 }
2769out:
2770 if (!ret)
4a404bea 2771 pr_warn("Unable to parse numa_balancing=\n");
1a687c2e
MG
2772
2773 return ret;
2774}
2775__setup("numa_balancing=", setup_numabalancing);
2776#else
2777static inline void __init check_numabalancing_enable(void)
2778{
2779}
2780#endif /* CONFIG_NUMA_BALANCING */
2781
1da177e4
LT
2782/* assumes fs == KERNEL_DS */
2783void __init numa_policy_init(void)
2784{
b71636e2
PM
2785 nodemask_t interleave_nodes;
2786 unsigned long largest = 0;
2787 int nid, prefer = 0;
2788
1da177e4
LT
2789 policy_cache = kmem_cache_create("numa_policy",
2790 sizeof(struct mempolicy),
20c2df83 2791 0, SLAB_PANIC, NULL);
1da177e4
LT
2792
2793 sn_cache = kmem_cache_create("shared_policy_node",
2794 sizeof(struct sp_node),
20c2df83 2795 0, SLAB_PANIC, NULL);
1da177e4 2796
5606e387
MG
2797 for_each_node(nid) {
2798 preferred_node_policy[nid] = (struct mempolicy) {
2799 .refcnt = ATOMIC_INIT(1),
2800 .mode = MPOL_PREFERRED,
2801 .flags = MPOL_F_MOF | MPOL_F_MORON,
2802 .v = { .preferred_node = nid, },
2803 };
2804 }
2805
b71636e2
PM
2806 /*
2807 * Set interleaving policy for system init. Interleaving is only
2808 * enabled across suitably sized nodes (default is >= 16MB), or
2809 * fall back to the largest node if they're all smaller.
2810 */
2811 nodes_clear(interleave_nodes);
01f13bd6 2812 for_each_node_state(nid, N_MEMORY) {
b71636e2
PM
2813 unsigned long total_pages = node_present_pages(nid);
2814
2815 /* Preserve the largest node */
2816 if (largest < total_pages) {
2817 largest = total_pages;
2818 prefer = nid;
2819 }
2820
2821 /* Interleave this node? */
2822 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2823 node_set(nid, interleave_nodes);
2824 }
2825
2826 /* All too small, use the largest */
2827 if (unlikely(nodes_empty(interleave_nodes)))
2828 node_set(prefer, interleave_nodes);
1da177e4 2829
028fec41 2830 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
b1de0d13 2831 pr_err("%s: interleaving failed\n", __func__);
1a687c2e
MG
2832
2833 check_numabalancing_enable();
1da177e4
LT
2834}
2835
8bccd85f 2836/* Reset policy of current process to default */
1da177e4
LT
2837void numa_default_policy(void)
2838{
028fec41 2839 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1da177e4 2840}
68860ec1 2841
095f1fc4
LS
2842/*
2843 * Parse and format mempolicy from/to strings
2844 */
2845
1a75a6c8 2846/*
f2a07f40 2847 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
1a75a6c8 2848 */
345ace9c
LS
2849static const char * const policy_modes[] =
2850{
2851 [MPOL_DEFAULT] = "default",
2852 [MPOL_PREFERRED] = "prefer",
2853 [MPOL_BIND] = "bind",
2854 [MPOL_INTERLEAVE] = "interleave",
d3a71033 2855 [MPOL_LOCAL] = "local",
345ace9c 2856};
1a75a6c8 2857
095f1fc4
LS
2858
2859#ifdef CONFIG_TMPFS
2860/**
f2a07f40 2861 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
095f1fc4 2862 * @str: string containing mempolicy to parse
71fe804b 2863 * @mpol: pointer to struct mempolicy pointer, returned on success.
095f1fc4
LS
2864 *
2865 * Format of input:
2866 * <mode>[=<flags>][:<nodelist>]
2867 *
71fe804b 2868 * On success, returns 0, else 1
095f1fc4 2869 */
a7a88b23 2870int mpol_parse_str(char *str, struct mempolicy **mpol)
095f1fc4 2871{
71fe804b 2872 struct mempolicy *new = NULL;
f2a07f40 2873 unsigned short mode_flags;
71fe804b 2874 nodemask_t nodes;
095f1fc4
LS
2875 char *nodelist = strchr(str, ':');
2876 char *flags = strchr(str, '=');
dedf2c73 2877 int err = 1, mode;
095f1fc4 2878
c7a91bc7
DC
2879 if (flags)
2880 *flags++ = '\0'; /* terminate mode string */
2881
095f1fc4
LS
2882 if (nodelist) {
2883 /* NUL-terminate mode or flags string */
2884 *nodelist++ = '\0';
71fe804b 2885 if (nodelist_parse(nodelist, nodes))
095f1fc4 2886 goto out;
01f13bd6 2887 if (!nodes_subset(nodes, node_states[N_MEMORY]))
095f1fc4 2888 goto out;
71fe804b
LS
2889 } else
2890 nodes_clear(nodes);
2891
dedf2c73 2892 mode = match_string(policy_modes, MPOL_MAX, str);
2893 if (mode < 0)
095f1fc4
LS
2894 goto out;
2895
71fe804b 2896 switch (mode) {
095f1fc4 2897 case MPOL_PREFERRED:
71fe804b 2898 /*
aa9f7d51
RD
2899 * Insist on a nodelist of one node only, although later
2900 * we use first_node(nodes) to grab a single node, so here
2901 * nodelist (or nodes) cannot be empty.
71fe804b 2902 */
095f1fc4
LS
2903 if (nodelist) {
2904 char *rest = nodelist;
2905 while (isdigit(*rest))
2906 rest++;
926f2ae0
KM
2907 if (*rest)
2908 goto out;
aa9f7d51
RD
2909 if (nodes_empty(nodes))
2910 goto out;
095f1fc4
LS
2911 }
2912 break;
095f1fc4
LS
2913 case MPOL_INTERLEAVE:
2914 /*
2915 * Default to online nodes with memory if no nodelist
2916 */
2917 if (!nodelist)
01f13bd6 2918 nodes = node_states[N_MEMORY];
3f226aa1 2919 break;
71fe804b 2920 case MPOL_LOCAL:
3f226aa1 2921 /*
71fe804b 2922 * Don't allow a nodelist; mpol_new() checks flags
3f226aa1 2923 */
71fe804b 2924 if (nodelist)
3f226aa1 2925 goto out;
71fe804b 2926 mode = MPOL_PREFERRED;
3f226aa1 2927 break;
413b43de
RT
2928 case MPOL_DEFAULT:
2929 /*
 2930 * Insist on an empty nodelist
2931 */
2932 if (!nodelist)
2933 err = 0;
2934 goto out;
d69b2e63
KM
2935 case MPOL_BIND:
2936 /*
2937 * Insist on a nodelist
2938 */
2939 if (!nodelist)
2940 goto out;
095f1fc4
LS
2941 }
2942
71fe804b 2943 mode_flags = 0;
095f1fc4
LS
2944 if (flags) {
2945 /*
2946 * Currently, we only support two mutually exclusive
2947 * mode flags.
2948 */
2949 if (!strcmp(flags, "static"))
71fe804b 2950 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4 2951 else if (!strcmp(flags, "relative"))
71fe804b 2952 mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4 2953 else
926f2ae0 2954 goto out;
095f1fc4 2955 }
71fe804b
LS
2956
2957 new = mpol_new(mode, mode_flags, &nodes);
2958 if (IS_ERR(new))
926f2ae0
KM
2959 goto out;
2960
f2a07f40
HD
2961 /*
2962 * Save nodes for mpol_to_str() to show the tmpfs mount options
2963 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2964 */
2965 if (mode != MPOL_PREFERRED)
2966 new->v.nodes = nodes;
2967 else if (nodelist)
2968 new->v.preferred_node = first_node(nodes);
2969 else
2970 new->flags |= MPOL_F_LOCAL;
2971
2972 /*
2973 * Save nodes for contextualization: this will be used to "clone"
2974 * the mempolicy in a specific context [cpuset] at a later time.
2975 */
2976 new->w.user_nodemask = nodes;
2977
926f2ae0 2978 err = 0;
71fe804b 2979
095f1fc4
LS
2980out:
2981 /* Restore string for error message */
2982 if (nodelist)
2983 *--nodelist = ':';
2984 if (flags)
2985 *--flags = '=';
71fe804b
LS
2986 if (!err)
2987 *mpol = new;
095f1fc4
LS
2988 return err;
2989}
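A hedged sketch of the parser contract above (hypothetical caller inside CONFIG_TMPFS): the input string must be writable, since the parser NUL-terminates the mode and flag substrings in place.

/* Hedged sketch: parse the string form used by tmpfs' "mpol=" mount option. */
static struct mempolicy *example_parse_mount_opt(void)
{
	char buf[] = "interleave=static:0-3";	/* <mode>[=<flags>][:<nodelist>] */
	struct mempolicy *mpol = NULL;

	if (mpol_parse_str(buf, &mpol))
		return NULL;	/* returns 1 on parse error */
	return mpol;		/* caller now owns a reference */
}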
2990#endif /* CONFIG_TMPFS */
2991
71fe804b
LS
2992/**
2993 * mpol_to_str - format a mempolicy structure for printing
2994 * @buffer: to contain formatted mempolicy string
2995 * @maxlen: length of @buffer
2996 * @pol: pointer to mempolicy to be formatted
71fe804b 2997 *
948927ee
DR
2998 * Convert @pol into a string. If @buffer is too short, truncate the string.
2999 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3000 * longest flag, "relative", and to display at least a few node ids.
1a75a6c8 3001 */
948927ee 3002void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1a75a6c8
CL
3003{
3004 char *p = buffer;
948927ee
DR
3005 nodemask_t nodes = NODE_MASK_NONE;
3006 unsigned short mode = MPOL_DEFAULT;
3007 unsigned short flags = 0;
2291990a 3008
8790c71a 3009 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
bea904d5 3010 mode = pol->mode;
948927ee
DR
3011 flags = pol->flags;
3012 }
bea904d5 3013
1a75a6c8
CL
3014 switch (mode) {
3015 case MPOL_DEFAULT:
1a75a6c8 3016 break;
1a75a6c8 3017 case MPOL_PREFERRED:
fc36b8d3 3018 if (flags & MPOL_F_LOCAL)
f2a07f40 3019 mode = MPOL_LOCAL;
53f2556b 3020 else
fc36b8d3 3021 node_set(pol->v.preferred_node, nodes);
1a75a6c8 3022 break;
1a75a6c8 3023 case MPOL_BIND:
1a75a6c8 3024 case MPOL_INTERLEAVE:
f2a07f40 3025 nodes = pol->v.nodes;
1a75a6c8 3026 break;
1a75a6c8 3027 default:
948927ee
DR
3028 WARN_ON_ONCE(1);
3029 snprintf(p, maxlen, "unknown");
3030 return;
1a75a6c8
CL
3031 }
3032
b7a9f420 3033 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
1a75a6c8 3034
fc36b8d3 3035 if (flags & MPOL_MODE_FLAGS) {
948927ee 3036 p += snprintf(p, buffer + maxlen - p, "=");
f5b087b5 3037
2291990a
LS
3038 /*
3039 * Currently, the only defined flags are mutually exclusive
3040 */
f5b087b5 3041 if (flags & MPOL_F_STATIC_NODES)
2291990a
LS
3042 p += snprintf(p, buffer + maxlen - p, "static");
3043 else if (flags & MPOL_F_RELATIVE_NODES)
3044 p += snprintf(p, buffer + maxlen - p, "relative");
f5b087b5
DR
3045 }
3046
9e763e0f
TH
3047 if (!nodes_empty(nodes))
3048 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3049 nodemask_pr_args(&nodes));
1a75a6c8 3050}
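Finally, a hedged sketch of the formatting side (hypothetical show function; assumes <linux/seq_file.h>): a 64-byte buffer comfortably exceeds the recommended 32-byte minimum and yields strings such as "interleave:0-3".

/* Hedged sketch: print the current task's policy in its string form. */
static void example_show_task_policy(struct seq_file *m)
{
	char buf[64];

	/* Only the task itself changes current->mempolicy, so no lock here. */
	mpol_to_str(buf, sizeof(buf), current->mempolicy);
	seq_printf(m, "mempolicy: %s\n", buf);
}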