/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case NUMA_NO_NODE here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernels, lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
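/*
 * Illustrative sketch (not part of mempolicy.c): how the policies described
 * above are typically requested from userspace, assuming the libnuma
 * <numaif.h> wrappers for the set_mempolicy(2)/mbind(2) syscalls are
 * available. 'buf' and 'len' are placeholders for an existing mapping.
 *
 *	#include <numaif.h>
 *
 *	// Interleave all future process allocations across nodes 0 and 1.
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// Bind one mapping (a VMA policy) to node 0 only.
 *	unsigned long bind_mask = 1UL << 0;
 *	mbind(buf, len, MPOL_BIND, &bind_mask, sizeof(bind_mask) * 8, 0);
 *
 * The VMA policy installed by mbind() takes precedence over the process
 * policy for faults in that range, as noted above.
 */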
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>

#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}
145
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If the read-side task has no lock to protect task->mempolicy, the
	 * write-side task rebinds task->mempolicy in two steps. The first
	 * step sets all the newly allowed nodes, and the second step clears
	 * all the disallowed nodes. This way we avoid a window in which no
	 * node is left to allocate a page from.
	 * If we do have a lock to protect task->mempolicy on the read side,
	 * we rebind directly.
	 *
	 * step:
	 *	MPOL_REBIND_ONCE  - do the rebind work at once
	 *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
	 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
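/*
 * Worked example of the two-step rebind above (illustrative only, for the
 * default case without MPOL_F_STATIC_NODES/MPOL_F_RELATIVE_NODES): rebinding
 * an interleave policy from nodes {0,1} to {2,3} without a read-side lock
 * proceeds roughly as
 *
 *	MPOL_REBIND_STEP1:  pol->v.nodes = {0,1} | {2,3} = {0,1,2,3}
 *	MPOL_REBIND_STEP2:  pol->v.nodes = {2,3}
 *
 * so a concurrent reader always observes a non-empty nodemask and can find
 * a node to allocate from at every point in between.
 */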
165
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
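/*
 * Worked example for mpol_relative_nodemask() (illustrative, based on the
 * documented nodes_fold()/nodes_onto() semantics): with a user-supplied
 * relative mask *orig = {0,2} and an allowed set *rel = {4,6,7} (weight 3),
 * nodes_fold() wraps the requested bits modulo 3, giving {0,2}, and
 * nodes_onto() maps those positions onto the 0th and 2nd allowed nodes,
 * so *ret = {4,7}.
 */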
178
37012946
DR
179static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
180{
181 if (nodes_empty(*nodes))
182 return -EINVAL;
183 pol->v.nodes = *nodes;
184 return 0;
185}
186
187static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
188{
189 if (!nodes)
fc36b8d3 190 pol->flags |= MPOL_F_LOCAL; /* local allocation */
37012946
DR
191 else if (nodes_empty(*nodes))
192 return -EINVAL; /* no allowed nodes */
193 else
194 pol->v.preferred_node = first_node(*nodes);
195 return 0;
196}
197
198static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
199{
859f7ef1 200 if (nodes_empty(*nodes))
37012946
DR
201 return -EINVAL;
202 pol->v.nodes = *nodes;
203 return 0;
204}
205
58568d2a
MX
206/*
207 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
208 * any, for the new policy. mpol_new() has already validated the nodes
209 * parameter with respect to the policy mode and flags. But, we need to
210 * handle an empty nodemask with MPOL_PREFERRED here.
211 *
212 * Must be called holding task's alloc_lock to protect task's mems_allowed
213 * and mempolicy. May also be called holding the mmap_semaphore for write.
214 */
4bfc4495
KH
215static int mpol_set_nodemask(struct mempolicy *pol,
216 const nodemask_t *nodes, struct nodemask_scratch *nsc)
58568d2a 217{
58568d2a
MX
218 int ret;
219
220 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
221 if (pol == NULL)
222 return 0;
01f13bd6 223 /* Check N_MEMORY */
4bfc4495 224 nodes_and(nsc->mask1,
01f13bd6 225 cpuset_current_mems_allowed, node_states[N_MEMORY]);
58568d2a
MX
226
227 VM_BUG_ON(!nodes);
228 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
229 nodes = NULL; /* explicit local allocation */
230 else {
231 if (pol->flags & MPOL_F_RELATIVE_NODES)
859f7ef1 232 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
58568d2a 233 else
4bfc4495
KH
234 nodes_and(nsc->mask2, *nodes, nsc->mask1);
235
58568d2a
MX
236 if (mpol_store_user_nodemask(pol))
237 pol->w.user_nodemask = *nodes;
238 else
239 pol->w.cpuset_mems_allowed =
240 cpuset_current_mems_allowed;
241 }
242
4bfc4495
KH
243 if (nodes)
244 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
245 else
246 ret = mpol_ops[pol->mode].create(pol, NULL);
58568d2a
MX
247 return ret;
248}
249
/*
 * This function just creates a new policy, does some sanity checking and
 * simple initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
028fec41
DR
254static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
255 nodemask_t *nodes)
1da177e4
LT
256{
257 struct mempolicy *policy;
258
028fec41 259 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
00ef2d2f 260 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
140d5a49 261
3e1f0645
DR
262 if (mode == MPOL_DEFAULT) {
263 if (nodes && !nodes_empty(*nodes))
37012946 264 return ERR_PTR(-EINVAL);
d3a71033 265 return NULL;
37012946 266 }
3e1f0645
DR
267 VM_BUG_ON(!nodes);
268
269 /*
270 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
271 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
272 * All other modes require a valid pointer to a non-empty nodemask.
273 */
274 if (mode == MPOL_PREFERRED) {
275 if (nodes_empty(*nodes)) {
276 if (((flags & MPOL_F_STATIC_NODES) ||
277 (flags & MPOL_F_RELATIVE_NODES)))
278 return ERR_PTR(-EINVAL);
3e1f0645 279 }
479e2802 280 } else if (mode == MPOL_LOCAL) {
8d303e44
PK
281 if (!nodes_empty(*nodes) ||
282 (flags & MPOL_F_STATIC_NODES) ||
283 (flags & MPOL_F_RELATIVE_NODES))
479e2802
PZ
284 return ERR_PTR(-EINVAL);
285 mode = MPOL_PREFERRED;
3e1f0645
DR
286 } else if (nodes_empty(*nodes))
287 return ERR_PTR(-EINVAL);
1da177e4
LT
288 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
289 if (!policy)
290 return ERR_PTR(-ENOMEM);
291 atomic_set(&policy->refcnt, 1);
45c4745a 292 policy->mode = mode;
3e1f0645 293 policy->flags = flags;
37012946 294
1da177e4 295 return policy;
37012946
DR
296}
297
52cd3b07
LS
298/* Slow path of a mpol destructor. */
299void __mpol_put(struct mempolicy *p)
300{
301 if (!atomic_dec_and_test(&p->refcnt))
302 return;
52cd3b07
LS
303 kmem_cache_free(policy_cache, p);
304}
305
708c1bbc
MX
306static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
307 enum mpol_rebind_step step)
37012946
DR
308{
309}
310
708c1bbc
MX
311/*
312 * step:
313 * MPOL_REBIND_ONCE - do rebind work at once
314 * MPOL_REBIND_STEP1 - set all the newly nodes
315 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
316 */
317static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
318 enum mpol_rebind_step step)
37012946
DR
319{
320 nodemask_t tmp;
321
322 if (pol->flags & MPOL_F_STATIC_NODES)
323 nodes_and(tmp, pol->w.user_nodemask, *nodes);
324 else if (pol->flags & MPOL_F_RELATIVE_NODES)
325 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
326 else {
708c1bbc
MX
327 /*
328 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
329 * result
330 */
331 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
332 nodes_remap(tmp, pol->v.nodes,
333 pol->w.cpuset_mems_allowed, *nodes);
334 pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
335 } else if (step == MPOL_REBIND_STEP2) {
336 tmp = pol->w.cpuset_mems_allowed;
337 pol->w.cpuset_mems_allowed = *nodes;
338 } else
339 BUG();
37012946 340 }
f5b087b5 341
708c1bbc
MX
342 if (nodes_empty(tmp))
343 tmp = *nodes;
344
345 if (step == MPOL_REBIND_STEP1)
346 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
347 else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
348 pol->v.nodes = tmp;
349 else
350 BUG();
351
37012946 352 if (!node_isset(current->il_next, tmp)) {
0edaf86c 353 current->il_next = next_node_in(current->il_next, tmp);
37012946
DR
354 if (current->il_next >= MAX_NUMNODES)
355 current->il_next = numa_node_id();
356 }
357}
358
359static void mpol_rebind_preferred(struct mempolicy *pol,
708c1bbc
MX
360 const nodemask_t *nodes,
361 enum mpol_rebind_step step)
37012946
DR
362{
363 nodemask_t tmp;
364
37012946
DR
365 if (pol->flags & MPOL_F_STATIC_NODES) {
366 int node = first_node(pol->w.user_nodemask);
367
fc36b8d3 368 if (node_isset(node, *nodes)) {
37012946 369 pol->v.preferred_node = node;
fc36b8d3
LS
370 pol->flags &= ~MPOL_F_LOCAL;
371 } else
372 pol->flags |= MPOL_F_LOCAL;
37012946
DR
373 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
374 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
375 pol->v.preferred_node = first_node(tmp);
fc36b8d3 376 } else if (!(pol->flags & MPOL_F_LOCAL)) {
37012946
DR
377 pol->v.preferred_node = node_remap(pol->v.preferred_node,
378 pol->w.cpuset_mems_allowed,
379 *nodes);
380 pol->w.cpuset_mems_allowed = *nodes;
381 }
1da177e4
LT
382}
383
708c1bbc
MX
384/*
385 * mpol_rebind_policy - Migrate a policy to a different set of nodes
386 *
387 * If read-side task has no lock to protect task->mempolicy, write-side
388 * task will rebind the task->mempolicy by two step. The first step is
389 * setting all the newly nodes, and the second step is cleaning all the
390 * disallowed nodes. In this way, we can avoid finding no node to alloc
391 * page.
392 * If we have a lock to protect task->mempolicy in read-side, we do
393 * rebind directly.
394 *
395 * step:
396 * MPOL_REBIND_ONCE - do rebind work at once
397 * MPOL_REBIND_STEP1 - set all the newly nodes
398 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
399 */
400static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
401 enum mpol_rebind_step step)
1d0d2680 402{
1d0d2680
DR
403 if (!pol)
404 return;
89c522c7 405 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
1d0d2680
DR
406 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
407 return;
708c1bbc
MX
408
409 if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
410 return;
411
412 if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
413 BUG();
414
415 if (step == MPOL_REBIND_STEP1)
416 pol->flags |= MPOL_F_REBINDING;
417 else if (step == MPOL_REBIND_STEP2)
418 pol->flags &= ~MPOL_F_REBINDING;
419 else if (step >= MPOL_REBIND_NSTEP)
420 BUG();
421
422 mpol_ops[pol->mode].rebind(pol, newmask, step);
1d0d2680
DR
423}
424
425/*
426 * Wrapper for mpol_rebind_policy() that just requires task
427 * pointer, and updates task mempolicy.
58568d2a
MX
428 *
429 * Called with task's alloc_lock held.
1d0d2680
DR
430 */
431
708c1bbc
MX
432void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
433 enum mpol_rebind_step step)
1d0d2680 434{
708c1bbc 435 mpol_rebind_policy(tsk->mempolicy, new, step);
1d0d2680
DR
436}
437
438/*
439 * Rebind each vma in mm to new nodemask.
440 *
441 * Call holding a reference to mm. Takes mm->mmap_sem during call.
442 */
443
444void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
445{
446 struct vm_area_struct *vma;
447
448 down_write(&mm->mmap_sem);
449 for (vma = mm->mmap; vma; vma = vma->vm_next)
708c1bbc 450 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
1d0d2680
DR
451 up_write(&mm->mmap_sem);
452}
453
37012946
DR
454static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
455 [MPOL_DEFAULT] = {
456 .rebind = mpol_rebind_default,
457 },
458 [MPOL_INTERLEAVE] = {
459 .create = mpol_new_interleave,
460 .rebind = mpol_rebind_nodemask,
461 },
462 [MPOL_PREFERRED] = {
463 .create = mpol_new_preferred,
464 .rebind = mpol_rebind_preferred,
465 },
466 [MPOL_BIND] = {
467 .create = mpol_new_bind,
468 .rebind = mpol_rebind_nodemask,
469 },
470};
471
fc301289
CL
472static void migrate_page_add(struct page *page, struct list_head *pagelist,
473 unsigned long flags);
1a75a6c8 474
6f4576e3
NH
475struct queue_pages {
476 struct list_head *pagelist;
477 unsigned long flags;
478 nodemask_t *nmask;
479 struct vm_area_struct *prev;
480};
481
98094945
NH
/*
 * Scan through the pages in a pmd range, checking whether each page matches
 * the node and flag conditions of the queue_pages walk, and move matching
 * pages to the pagelist.
 */
6f4576e3
NH
486static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
487 unsigned long end, struct mm_walk *walk)
1da177e4 488{
6f4576e3
NH
489 struct vm_area_struct *vma = walk->vma;
490 struct page *page;
491 struct queue_pages *qp = walk->private;
492 unsigned long flags = qp->flags;
248db92d 493 int nid, ret;
91612e0d 494 pte_t *pte;
705e87c0 495 spinlock_t *ptl;
941150a3 496
248db92d
KS
497 if (pmd_trans_huge(*pmd)) {
498 ptl = pmd_lock(walk->mm, pmd);
499 if (pmd_trans_huge(*pmd)) {
500 page = pmd_page(*pmd);
501 if (is_huge_zero_page(page)) {
502 spin_unlock(ptl);
fd60775a 503 __split_huge_pmd(vma, pmd, addr, false, NULL);
248db92d
KS
504 } else {
505 get_page(page);
506 spin_unlock(ptl);
507 lock_page(page);
508 ret = split_huge_page(page);
509 unlock_page(page);
510 put_page(page);
511 if (ret)
512 return 0;
513 }
514 } else {
515 spin_unlock(ptl);
516 }
517 }
91612e0d 518
337d9abf
NH
519 if (pmd_trans_unstable(pmd))
520 return 0;
248db92d 521retry:
6f4576e3
NH
522 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
523 for (; addr != end; pte++, addr += PAGE_SIZE) {
91612e0d 524 if (!pte_present(*pte))
1da177e4 525 continue;
6aab341e
LT
526 page = vm_normal_page(vma, addr, *pte);
527 if (!page)
1da177e4 528 continue;
053837fc 529 /*
62b61f61
HD
530 * vm_normal_page() filters out zero pages, but there might
531 * still be PageReserved pages to skip, perhaps in a VDSO.
053837fc 532 */
b79bc0a0 533 if (PageReserved(page))
f4598c8b 534 continue;
6aab341e 535 nid = page_to_nid(page);
6f4576e3 536 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
38e35860 537 continue;
800d8c63 538 if (PageTransCompound(page)) {
248db92d
KS
539 get_page(page);
540 pte_unmap_unlock(pte, ptl);
541 lock_page(page);
542 ret = split_huge_page(page);
543 unlock_page(page);
544 put_page(page);
545 /* Failed to split -- skip. */
546 if (ret) {
547 pte = pte_offset_map_lock(walk->mm, pmd,
548 addr, &ptl);
549 continue;
550 }
551 goto retry;
552 }
38e35860 553
77bf45e7 554 migrate_page_add(page, qp->pagelist, flags);
6f4576e3
NH
555 }
556 pte_unmap_unlock(pte - 1, ptl);
557 cond_resched();
558 return 0;
91612e0d
HD
559}
560
6f4576e3
NH
561static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
562 unsigned long addr, unsigned long end,
563 struct mm_walk *walk)
e2d8cf40
NH
564{
565#ifdef CONFIG_HUGETLB_PAGE
6f4576e3
NH
566 struct queue_pages *qp = walk->private;
567 unsigned long flags = qp->flags;
e2d8cf40
NH
568 int nid;
569 struct page *page;
cb900f41 570 spinlock_t *ptl;
d4c54919 571 pte_t entry;
e2d8cf40 572
6f4576e3
NH
573 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
574 entry = huge_ptep_get(pte);
d4c54919
NH
575 if (!pte_present(entry))
576 goto unlock;
577 page = pte_page(entry);
e2d8cf40 578 nid = page_to_nid(page);
6f4576e3 579 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
e2d8cf40
NH
580 goto unlock;
581 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
582 if (flags & (MPOL_MF_MOVE_ALL) ||
583 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
6f4576e3 584 isolate_huge_page(page, qp->pagelist);
e2d8cf40 585unlock:
cb900f41 586 spin_unlock(ptl);
e2d8cf40
NH
587#else
588 BUG();
589#endif
91612e0d 590 return 0;
1da177e4
LT
591}
592
5877231f 593#ifdef CONFIG_NUMA_BALANCING
b24f53a0 594/*
4b10e7d5
MG
595 * This is used to mark a range of virtual addresses to be inaccessible.
596 * These are later cleared by a NUMA hinting fault. Depending on these
597 * faults, pages may be migrated for better NUMA placement.
598 *
599 * This is assuming that NUMA faults are handled using PROT_NONE. If
600 * an architecture makes a different choice, it will need further
601 * changes to the core.
b24f53a0 602 */
4b10e7d5
MG
603unsigned long change_prot_numa(struct vm_area_struct *vma,
604 unsigned long addr, unsigned long end)
b24f53a0 605{
4b10e7d5 606 int nr_updated;
b24f53a0 607
4d942466 608 nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
03c5a6e1
MG
609 if (nr_updated)
610 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
b24f53a0 611
4b10e7d5 612 return nr_updated;
b24f53a0
LS
613}
614#else
615static unsigned long change_prot_numa(struct vm_area_struct *vma,
616 unsigned long addr, unsigned long end)
617{
618 return 0;
619}
5877231f 620#endif /* CONFIG_NUMA_BALANCING */
b24f53a0 621
6f4576e3
NH
622static int queue_pages_test_walk(unsigned long start, unsigned long end,
623 struct mm_walk *walk)
624{
625 struct vm_area_struct *vma = walk->vma;
626 struct queue_pages *qp = walk->private;
627 unsigned long endvma = vma->vm_end;
628 unsigned long flags = qp->flags;
629
77bf45e7 630 if (!vma_migratable(vma))
48684a65
NH
631 return 1;
632
6f4576e3
NH
633 if (endvma > end)
634 endvma = end;
635 if (vma->vm_start > start)
636 start = vma->vm_start;
637
638 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
639 if (!vma->vm_next && vma->vm_end < end)
640 return -EFAULT;
641 if (qp->prev && qp->prev->vm_end < vma->vm_start)
642 return -EFAULT;
643 }
644
645 qp->prev = vma;
646
6f4576e3
NH
647 if (flags & MPOL_MF_LAZY) {
648 /* Similar to task_numa_work, skip inaccessible VMAs */
4355c018
LC
649 if (!is_vm_hugetlb_page(vma) &&
650 (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
651 !(vma->vm_flags & VM_MIXEDMAP))
6f4576e3
NH
652 change_prot_numa(vma, start, endvma);
653 return 1;
654 }
655
77bf45e7
KS
656 /* queue pages from current vma */
657 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
6f4576e3
NH
658 return 0;
659 return 1;
660}
661
/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on the set of nodes determined by
 * @nodes and @flags, they are isolated and queued onto the pagelist passed
 * via @private.
 */
d05f0cdc 669static int
98094945 670queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
6f4576e3
NH
671 nodemask_t *nodes, unsigned long flags,
672 struct list_head *pagelist)
1da177e4 673{
6f4576e3
NH
674 struct queue_pages qp = {
675 .pagelist = pagelist,
676 .flags = flags,
677 .nmask = nodes,
678 .prev = NULL,
679 };
680 struct mm_walk queue_pages_walk = {
681 .hugetlb_entry = queue_pages_hugetlb,
682 .pmd_entry = queue_pages_pte_range,
683 .test_walk = queue_pages_test_walk,
684 .mm = mm,
685 .private = &qp,
686 };
687
688 return walk_page_range(start, end, &queue_pages_walk);
1da177e4
LT
689}
690
869833f2
KM
691/*
692 * Apply policy to a single VMA
693 * This must be called with the mmap_sem held for writing.
694 */
695static int vma_replace_policy(struct vm_area_struct *vma,
696 struct mempolicy *pol)
8d34694c 697{
869833f2
KM
698 int err;
699 struct mempolicy *old;
700 struct mempolicy *new;
8d34694c
KM
701
702 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
703 vma->vm_start, vma->vm_end, vma->vm_pgoff,
704 vma->vm_ops, vma->vm_file,
705 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
706
869833f2
KM
707 new = mpol_dup(pol);
708 if (IS_ERR(new))
709 return PTR_ERR(new);
710
711 if (vma->vm_ops && vma->vm_ops->set_policy) {
8d34694c 712 err = vma->vm_ops->set_policy(vma, new);
869833f2
KM
713 if (err)
714 goto err_out;
8d34694c 715 }
869833f2
KM
716
717 old = vma->vm_policy;
718 vma->vm_policy = new; /* protected by mmap_sem */
719 mpol_put(old);
720
721 return 0;
722 err_out:
723 mpol_put(new);
8d34694c
KM
724 return err;
725}
726
1da177e4 727/* Step 2: apply policy to a range and do splits. */
9d8cebd4
KM
728static int mbind_range(struct mm_struct *mm, unsigned long start,
729 unsigned long end, struct mempolicy *new_pol)
1da177e4
LT
730{
731 struct vm_area_struct *next;
9d8cebd4
KM
732 struct vm_area_struct *prev;
733 struct vm_area_struct *vma;
734 int err = 0;
e26a5114 735 pgoff_t pgoff;
9d8cebd4
KM
736 unsigned long vmstart;
737 unsigned long vmend;
1da177e4 738
097d5910 739 vma = find_vma(mm, start);
9d8cebd4
KM
740 if (!vma || vma->vm_start > start)
741 return -EFAULT;
742
097d5910 743 prev = vma->vm_prev;
e26a5114
KM
744 if (start > vma->vm_start)
745 prev = vma;
746
9d8cebd4 747 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
1da177e4 748 next = vma->vm_next;
9d8cebd4
KM
749 vmstart = max(start, vma->vm_start);
750 vmend = min(end, vma->vm_end);
751
e26a5114
KM
752 if (mpol_equal(vma_policy(vma), new_pol))
753 continue;
754
755 pgoff = vma->vm_pgoff +
756 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
9d8cebd4 757 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
19a809af
AA
758 vma->anon_vma, vma->vm_file, pgoff,
759 new_pol, vma->vm_userfaultfd_ctx);
9d8cebd4
KM
760 if (prev) {
761 vma = prev;
762 next = vma->vm_next;
3964acd0
ON
763 if (mpol_equal(vma_policy(vma), new_pol))
764 continue;
765 /* vma_merge() joined vma && vma->next, case 8 */
766 goto replace;
9d8cebd4
KM
767 }
768 if (vma->vm_start != vmstart) {
769 err = split_vma(vma->vm_mm, vma, vmstart, 1);
770 if (err)
771 goto out;
772 }
773 if (vma->vm_end != vmend) {
774 err = split_vma(vma->vm_mm, vma, vmend, 0);
775 if (err)
776 goto out;
777 }
3964acd0 778 replace:
869833f2 779 err = vma_replace_policy(vma, new_pol);
8d34694c
KM
780 if (err)
781 goto out;
1da177e4 782 }
9d8cebd4
KM
783
784 out:
1da177e4
LT
785 return err;
786}
787
1da177e4 788/* Set the process memory policy */
028fec41
DR
789static long do_set_mempolicy(unsigned short mode, unsigned short flags,
790 nodemask_t *nodes)
1da177e4 791{
58568d2a 792 struct mempolicy *new, *old;
4bfc4495 793 NODEMASK_SCRATCH(scratch);
58568d2a 794 int ret;
1da177e4 795
4bfc4495
KH
796 if (!scratch)
797 return -ENOMEM;
f4e53d91 798
4bfc4495
KH
799 new = mpol_new(mode, flags, nodes);
800 if (IS_ERR(new)) {
801 ret = PTR_ERR(new);
802 goto out;
803 }
2c7c3a7d 804
58568d2a 805 task_lock(current);
4bfc4495 806 ret = mpol_set_nodemask(new, nodes, scratch);
58568d2a
MX
807 if (ret) {
808 task_unlock(current);
58568d2a 809 mpol_put(new);
4bfc4495 810 goto out;
58568d2a
MX
811 }
812 old = current->mempolicy;
1da177e4 813 current->mempolicy = new;
45c4745a 814 if (new && new->mode == MPOL_INTERLEAVE &&
f5b087b5 815 nodes_weight(new->v.nodes))
dfcd3c0d 816 current->il_next = first_node(new->v.nodes);
58568d2a 817 task_unlock(current);
58568d2a 818 mpol_put(old);
4bfc4495
KH
819 ret = 0;
820out:
821 NODEMASK_SCRATCH_FREE(scratch);
822 return ret;
1da177e4
LT
823}
824
bea904d5
LS
825/*
826 * Return nodemask for policy for get_mempolicy() query
58568d2a
MX
827 *
828 * Called with task's alloc_lock held
bea904d5
LS
829 */
830static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
1da177e4 831{
dfcd3c0d 832 nodes_clear(*nodes);
bea904d5
LS
833 if (p == &default_policy)
834 return;
835
45c4745a 836 switch (p->mode) {
19770b32
MG
837 case MPOL_BIND:
838 /* Fall through */
1da177e4 839 case MPOL_INTERLEAVE:
dfcd3c0d 840 *nodes = p->v.nodes;
1da177e4
LT
841 break;
842 case MPOL_PREFERRED:
fc36b8d3 843 if (!(p->flags & MPOL_F_LOCAL))
dfcd3c0d 844 node_set(p->v.preferred_node, *nodes);
53f2556b 845 /* else return empty node mask for local allocation */
1da177e4
LT
846 break;
847 default:
848 BUG();
849 }
850}
851
d4edcf0d 852static int lookup_node(unsigned long addr)
1da177e4
LT
853{
854 struct page *p;
855 int err;
856
768ae309 857 err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
1da177e4
LT
858 if (err >= 0) {
859 err = page_to_nid(p);
860 put_page(p);
861 }
862 return err;
863}
864
1da177e4 865/* Retrieve NUMA policy */
dbcb0f19
AB
866static long do_get_mempolicy(int *policy, nodemask_t *nmask,
867 unsigned long addr, unsigned long flags)
1da177e4 868{
8bccd85f 869 int err;
1da177e4
LT
870 struct mm_struct *mm = current->mm;
871 struct vm_area_struct *vma = NULL;
872 struct mempolicy *pol = current->mempolicy;
873
754af6f5
LS
874 if (flags &
875 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
1da177e4 876 return -EINVAL;
754af6f5
LS
877
878 if (flags & MPOL_F_MEMS_ALLOWED) {
879 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
880 return -EINVAL;
881 *policy = 0; /* just so it's initialized */
58568d2a 882 task_lock(current);
754af6f5 883 *nmask = cpuset_current_mems_allowed;
58568d2a 884 task_unlock(current);
754af6f5
LS
885 return 0;
886 }
887
1da177e4 888 if (flags & MPOL_F_ADDR) {
bea904d5
LS
889 /*
890 * Do NOT fall back to task policy if the
891 * vma/shared policy at addr is NULL. We
892 * want to return MPOL_DEFAULT in this case.
893 */
1da177e4
LT
894 down_read(&mm->mmap_sem);
895 vma = find_vma_intersection(mm, addr, addr+1);
896 if (!vma) {
897 up_read(&mm->mmap_sem);
898 return -EFAULT;
899 }
900 if (vma->vm_ops && vma->vm_ops->get_policy)
901 pol = vma->vm_ops->get_policy(vma, addr);
902 else
903 pol = vma->vm_policy;
904 } else if (addr)
905 return -EINVAL;
906
907 if (!pol)
bea904d5 908 pol = &default_policy; /* indicates default behavior */
1da177e4
LT
909
910 if (flags & MPOL_F_NODE) {
911 if (flags & MPOL_F_ADDR) {
d4edcf0d 912 err = lookup_node(addr);
1da177e4
LT
913 if (err < 0)
914 goto out;
8bccd85f 915 *policy = err;
1da177e4 916 } else if (pol == current->mempolicy &&
45c4745a 917 pol->mode == MPOL_INTERLEAVE) {
8bccd85f 918 *policy = current->il_next;
1da177e4
LT
919 } else {
920 err = -EINVAL;
921 goto out;
922 }
bea904d5
LS
923 } else {
924 *policy = pol == &default_policy ? MPOL_DEFAULT :
925 pol->mode;
d79df630
DR
926 /*
927 * Internal mempolicy flags must be masked off before exposing
928 * the policy to userspace.
929 */
930 *policy |= (pol->flags & MPOL_MODE_FLAGS);
bea904d5 931 }
1da177e4
LT
932
933 if (vma) {
934 up_read(&current->mm->mmap_sem);
935 vma = NULL;
936 }
937
1da177e4 938 err = 0;
58568d2a 939 if (nmask) {
c6b6ef8b
LS
940 if (mpol_store_user_nodemask(pol)) {
941 *nmask = pol->w.user_nodemask;
942 } else {
943 task_lock(current);
944 get_policy_nodemask(pol, nmask);
945 task_unlock(current);
946 }
58568d2a 947 }
1da177e4
LT
948
949 out:
52cd3b07 950 mpol_cond_put(pol);
1da177e4
LT
951 if (vma)
952 up_read(&current->mm->mmap_sem);
953 return err;
954}
955
b20a3503 956#ifdef CONFIG_MIGRATION
6ce3c4c0
CL
957/*
958 * page migration
959 */
fc301289
CL
960static void migrate_page_add(struct page *page, struct list_head *pagelist,
961 unsigned long flags)
6ce3c4c0
CL
962{
963 /*
fc301289 964 * Avoid migrating a page that is shared with others.
6ce3c4c0 965 */
62695a84
NP
966 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
967 if (!isolate_lru_page(page)) {
968 list_add_tail(&page->lru, pagelist);
599d0c95 969 inc_node_page_state(page, NR_ISOLATED_ANON +
6d9c285a 970 page_is_file_cache(page));
62695a84
NP
971 }
972 }
7e2ab150 973}
6ce3c4c0 974
742755a1 975static struct page *new_node_page(struct page *page, unsigned long node, int **x)
95a402c3 976{
e2d8cf40
NH
977 if (PageHuge(page))
978 return alloc_huge_page_node(page_hstate(compound_head(page)),
979 node);
980 else
96db800f 981 return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
b360edb4 982 __GFP_THISNODE, 0);
95a402c3
CL
983}
984
7e2ab150
CL
985/*
986 * Migrate pages from one node to a target node.
987 * Returns error or the number of pages not migrated.
988 */
dbcb0f19
AB
989static int migrate_to_node(struct mm_struct *mm, int source, int dest,
990 int flags)
7e2ab150
CL
991{
992 nodemask_t nmask;
993 LIST_HEAD(pagelist);
994 int err = 0;
995
996 nodes_clear(nmask);
997 node_set(source, nmask);
6ce3c4c0 998
08270807
MK
999 /*
1000 * This does not "check" the range but isolates all pages that
1001 * need migration. Between passing in the full user address
1002 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1003 */
1004 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
98094945 1005 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
7e2ab150
CL
1006 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1007
cf608ac1 1008 if (!list_empty(&pagelist)) {
68711a74 1009 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
9c620e2b 1010 MIGRATE_SYNC, MR_SYSCALL);
cf608ac1 1011 if (err)
e2d8cf40 1012 putback_movable_pages(&pagelist);
cf608ac1 1013 }
95a402c3 1014
7e2ab150 1015 return err;
6ce3c4c0
CL
1016}
1017
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
0ce72d4f
AM
1024int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1025 const nodemask_t *to, int flags)
39743889 1026{
7e2ab150 1027 int busy = 0;
0aedadf9 1028 int err;
7e2ab150 1029 nodemask_t tmp;
39743889 1030
0aedadf9
CL
1031 err = migrate_prep();
1032 if (err)
1033 return err;
1034
53f2556b 1035 down_read(&mm->mmap_sem);
39743889 1036
	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fall back to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning from_tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */
d4984711 1067
0ce72d4f 1068 tmp = *from;
7e2ab150
CL
1069 while (!nodes_empty(tmp)) {
1070 int s,d;
b76ac7e7 1071 int source = NUMA_NO_NODE;
7e2ab150
CL
1072 int dest = 0;
1073
1074 for_each_node_mask(s, tmp) {
4a5b18cc
LW
1075
1076 /*
1077 * do_migrate_pages() tries to maintain the relative
1078 * node relationship of the pages established between
1079 * threads and memory areas.
1080 *
1081 * However if the number of source nodes is not equal to
1082 * the number of destination nodes we can not preserve
1083 * this node relative relationship. In that case, skip
1084 * copying memory from a node that is in the destination
1085 * mask.
1086 *
1087 * Example: [2,3,4] -> [3,4,5] moves everything.
1088 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1089 */
1090
0ce72d4f
AM
1091 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1092 (node_isset(s, *to)))
4a5b18cc
LW
1093 continue;
1094
0ce72d4f 1095 d = node_remap(s, *from, *to);
7e2ab150
CL
1096 if (s == d)
1097 continue;
1098
1099 source = s; /* Node moved. Memorize */
1100 dest = d;
1101
1102 /* dest not in remaining from nodes? */
1103 if (!node_isset(dest, tmp))
1104 break;
1105 }
b76ac7e7 1106 if (source == NUMA_NO_NODE)
7e2ab150
CL
1107 break;
1108
1109 node_clear(source, tmp);
1110 err = migrate_to_node(mm, source, dest, flags);
1111 if (err > 0)
1112 busy += err;
1113 if (err < 0)
1114 break;
39743889
CL
1115 }
1116 up_read(&mm->mmap_sem);
7e2ab150
CL
1117 if (err < 0)
1118 return err;
1119 return busy;
b20a3503
CL
1120
1121}
1122
3ad33b24
LS
1123/*
1124 * Allocate a new page for page migration based on vma policy.
d05f0cdc 1125 * Start by assuming the page is mapped by the same vma as contains @start.
3ad33b24
LS
1126 * Search forward from there, if not. N.B., this assumes that the
1127 * list of pages handed to migrate_pages()--which is how we get here--
1128 * is in virtual address order.
1129 */
d05f0cdc 1130static struct page *new_page(struct page *page, unsigned long start, int **x)
95a402c3 1131{
d05f0cdc 1132 struct vm_area_struct *vma;
3ad33b24 1133 unsigned long uninitialized_var(address);
95a402c3 1134
d05f0cdc 1135 vma = find_vma(current->mm, start);
3ad33b24
LS
1136 while (vma) {
1137 address = page_address_in_vma(page, vma);
1138 if (address != -EFAULT)
1139 break;
1140 vma = vma->vm_next;
1141 }
11c731e8
WL
1142
1143 if (PageHuge(page)) {
cc81717e
MH
1144 BUG_ON(!vma);
1145 return alloc_huge_page_noerr(vma, address, 1);
11c731e8 1146 }
0bf598d8 1147 /*
11c731e8 1148 * if !vma, alloc_page_vma() will use task or system default policy
0bf598d8 1149 */
3ad33b24 1150 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
95a402c3 1151}
b20a3503
CL
1152#else
1153
1154static void migrate_page_add(struct page *page, struct list_head *pagelist,
1155 unsigned long flags)
1156{
39743889
CL
1157}
1158
0ce72d4f
AM
1159int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1160 const nodemask_t *to, int flags)
b20a3503
CL
1161{
1162 return -ENOSYS;
1163}
95a402c3 1164
d05f0cdc 1165static struct page *new_page(struct page *page, unsigned long start, int **x)
95a402c3
CL
1166{
1167 return NULL;
1168}
b20a3503
CL
1169#endif
1170
dbcb0f19 1171static long do_mbind(unsigned long start, unsigned long len,
028fec41
DR
1172 unsigned short mode, unsigned short mode_flags,
1173 nodemask_t *nmask, unsigned long flags)
6ce3c4c0 1174{
6ce3c4c0
CL
1175 struct mm_struct *mm = current->mm;
1176 struct mempolicy *new;
1177 unsigned long end;
1178 int err;
1179 LIST_HEAD(pagelist);
1180
b24f53a0 1181 if (flags & ~(unsigned long)MPOL_MF_VALID)
6ce3c4c0 1182 return -EINVAL;
74c00241 1183 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
6ce3c4c0
CL
1184 return -EPERM;
1185
1186 if (start & ~PAGE_MASK)
1187 return -EINVAL;
1188
1189 if (mode == MPOL_DEFAULT)
1190 flags &= ~MPOL_MF_STRICT;
1191
1192 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1193 end = start + len;
1194
1195 if (end < start)
1196 return -EINVAL;
1197 if (end == start)
1198 return 0;
1199
028fec41 1200 new = mpol_new(mode, mode_flags, nmask);
6ce3c4c0
CL
1201 if (IS_ERR(new))
1202 return PTR_ERR(new);
1203
b24f53a0
LS
1204 if (flags & MPOL_MF_LAZY)
1205 new->flags |= MPOL_F_MOF;
1206
6ce3c4c0
CL
1207 /*
1208 * If we are using the default policy then operation
1209 * on discontinuous address spaces is okay after all
1210 */
1211 if (!new)
1212 flags |= MPOL_MF_DISCONTIG_OK;
1213
028fec41
DR
1214 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1215 start, start + len, mode, mode_flags,
00ef2d2f 1216 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
6ce3c4c0 1217
0aedadf9
CL
1218 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1219
1220 err = migrate_prep();
1221 if (err)
b05ca738 1222 goto mpol_out;
0aedadf9 1223 }
4bfc4495
KH
1224 {
1225 NODEMASK_SCRATCH(scratch);
1226 if (scratch) {
1227 down_write(&mm->mmap_sem);
1228 task_lock(current);
1229 err = mpol_set_nodemask(new, nmask, scratch);
1230 task_unlock(current);
1231 if (err)
1232 up_write(&mm->mmap_sem);
1233 } else
1234 err = -ENOMEM;
1235 NODEMASK_SCRATCH_FREE(scratch);
1236 }
b05ca738
KM
1237 if (err)
1238 goto mpol_out;
1239
d05f0cdc 1240 err = queue_pages_range(mm, start, end, nmask,
6ce3c4c0 1241 flags | MPOL_MF_INVERT, &pagelist);
d05f0cdc 1242 if (!err)
9d8cebd4 1243 err = mbind_range(mm, start, end, new);
7e2ab150 1244
b24f53a0
LS
1245 if (!err) {
1246 int nr_failed = 0;
1247
cf608ac1 1248 if (!list_empty(&pagelist)) {
b24f53a0 1249 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
d05f0cdc
HD
1250 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1251 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
cf608ac1 1252 if (nr_failed)
74060e4d 1253 putback_movable_pages(&pagelist);
cf608ac1 1254 }
6ce3c4c0 1255
b24f53a0 1256 if (nr_failed && (flags & MPOL_MF_STRICT))
6ce3c4c0 1257 err = -EIO;
ab8a3e14 1258 } else
b0e5fd73 1259 putback_movable_pages(&pagelist);
b20a3503 1260
6ce3c4c0 1261 up_write(&mm->mmap_sem);
b05ca738 1262 mpol_out:
f0be3d32 1263 mpol_put(new);
6ce3c4c0
CL
1264 return err;
1265}
1266
8bccd85f
CL
1267/*
1268 * User space interface with variable sized bitmaps for nodelists.
1269 */
1270
1271/* Copy a node mask from user space. */
39743889 1272static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
8bccd85f
CL
1273 unsigned long maxnode)
1274{
1275 unsigned long k;
1276 unsigned long nlongs;
1277 unsigned long endmask;
1278
1279 --maxnode;
1280 nodes_clear(*nodes);
1281 if (maxnode == 0 || !nmask)
1282 return 0;
a9c930ba 1283 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
636f13c1 1284 return -EINVAL;
8bccd85f
CL
1285
1286 nlongs = BITS_TO_LONGS(maxnode);
1287 if ((maxnode % BITS_PER_LONG) == 0)
1288 endmask = ~0UL;
1289 else
1290 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1291
	/* When the user specified more nodes than supported just check
	   if the unsupported part is all zero. */
1294 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1295 if (nlongs > PAGE_SIZE/sizeof(long))
1296 return -EINVAL;
1297 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1298 unsigned long t;
1299 if (get_user(t, nmask + k))
1300 return -EFAULT;
1301 if (k == nlongs - 1) {
1302 if (t & endmask)
1303 return -EINVAL;
1304 } else if (t)
1305 return -EINVAL;
1306 }
1307 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1308 endmask = ~0UL;
1309 }
1310
1311 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1312 return -EFAULT;
1313 nodes_addr(*nodes)[nlongs-1] &= endmask;
1314 return 0;
1315}
1316
1317/* Copy a kernel node mask to user space */
1318static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1319 nodemask_t *nodes)
1320{
1321 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1322 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1323
1324 if (copy > nbytes) {
1325 if (copy > PAGE_SIZE)
1326 return -EINVAL;
1327 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1328 return -EFAULT;
1329 copy = nbytes;
1330 }
1331 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1332}
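/*
 * Illustrative userspace view of the variable-sized bitmap interface handled
 * by get_nodes() and copy_nodes_to_user() above (a sketch, assuming the
 * libnuma <numaif.h> syscall wrappers; 'addr' and 'len' are placeholders for
 * an existing mapping): the caller passes an array of unsigned longs plus a
 * bit count, and set bits beyond the supported nodes are rejected.
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask[2] = { 0 };	// room for 128 node bits
 *	mask[0] |= 1UL << 3;		// request node 3
 *	if (mbind(addr, len, MPOL_BIND, mask, 128, 0) < 0)
 *		perror("mbind");
 *
 *	int mode;
 *	unsigned long got[2] = { 0 };
 *	get_mempolicy(&mode, got, 128, addr, MPOL_F_ADDR);
 */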
1333
938bb9f5 1334SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
f7f28ca9 1335 unsigned long, mode, const unsigned long __user *, nmask,
938bb9f5 1336 unsigned long, maxnode, unsigned, flags)
8bccd85f
CL
1337{
1338 nodemask_t nodes;
1339 int err;
028fec41 1340 unsigned short mode_flags;
8bccd85f 1341
028fec41
DR
1342 mode_flags = mode & MPOL_MODE_FLAGS;
1343 mode &= ~MPOL_MODE_FLAGS;
a3b51e01
DR
1344 if (mode >= MPOL_MAX)
1345 return -EINVAL;
4c50bc01
DR
1346 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1347 (mode_flags & MPOL_F_RELATIVE_NODES))
1348 return -EINVAL;
8bccd85f
CL
1349 err = get_nodes(&nodes, nmask, maxnode);
1350 if (err)
1351 return err;
028fec41 1352 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
8bccd85f
CL
1353}
1354
1355/* Set the process memory policy */
23c8902d 1356SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
938bb9f5 1357 unsigned long, maxnode)
8bccd85f
CL
1358{
1359 int err;
1360 nodemask_t nodes;
028fec41 1361 unsigned short flags;
8bccd85f 1362
028fec41
DR
1363 flags = mode & MPOL_MODE_FLAGS;
1364 mode &= ~MPOL_MODE_FLAGS;
1365 if ((unsigned int)mode >= MPOL_MAX)
8bccd85f 1366 return -EINVAL;
4c50bc01
DR
1367 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1368 return -EINVAL;
8bccd85f
CL
1369 err = get_nodes(&nodes, nmask, maxnode);
1370 if (err)
1371 return err;
028fec41 1372 return do_set_mempolicy(mode, flags, &nodes);
8bccd85f
CL
1373}
1374
938bb9f5
HC
1375SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1376 const unsigned long __user *, old_nodes,
1377 const unsigned long __user *, new_nodes)
39743889 1378{
c69e8d9c 1379 const struct cred *cred = current_cred(), *tcred;
596d7cfa 1380 struct mm_struct *mm = NULL;
39743889 1381 struct task_struct *task;
39743889
CL
1382 nodemask_t task_nodes;
1383 int err;
596d7cfa
KM
1384 nodemask_t *old;
1385 nodemask_t *new;
1386 NODEMASK_SCRATCH(scratch);
1387
1388 if (!scratch)
1389 return -ENOMEM;
39743889 1390
596d7cfa
KM
1391 old = &scratch->mask1;
1392 new = &scratch->mask2;
1393
1394 err = get_nodes(old, old_nodes, maxnode);
39743889 1395 if (err)
596d7cfa 1396 goto out;
39743889 1397
596d7cfa 1398 err = get_nodes(new, new_nodes, maxnode);
39743889 1399 if (err)
596d7cfa 1400 goto out;
39743889
CL
1401
1402 /* Find the mm_struct */
55cfaa3c 1403 rcu_read_lock();
228ebcbe 1404 task = pid ? find_task_by_vpid(pid) : current;
39743889 1405 if (!task) {
55cfaa3c 1406 rcu_read_unlock();
596d7cfa
KM
1407 err = -ESRCH;
1408 goto out;
39743889 1409 }
3268c63e 1410 get_task_struct(task);
39743889 1411
596d7cfa 1412 err = -EINVAL;
39743889
CL
1413
1414 /*
1415 * Check if this process has the right to modify the specified
1416 * process. The right exists if the process has administrative
7f927fcc 1417 * capabilities, superuser privileges or the same
39743889
CL
1418 * userid as the target process.
1419 */
c69e8d9c 1420 tcred = __task_cred(task);
b38a86eb
EB
1421 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1422 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
74c00241 1423 !capable(CAP_SYS_NICE)) {
c69e8d9c 1424 rcu_read_unlock();
39743889 1425 err = -EPERM;
3268c63e 1426 goto out_put;
39743889 1427 }
c69e8d9c 1428 rcu_read_unlock();
39743889
CL
1429
1430 task_nodes = cpuset_mems_allowed(task);
1431 /* Is the user allowed to access the target nodes? */
596d7cfa 1432 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
39743889 1433 err = -EPERM;
3268c63e 1434 goto out_put;
39743889
CL
1435 }
1436
01f13bd6 1437 if (!nodes_subset(*new, node_states[N_MEMORY])) {
3b42d28b 1438 err = -EINVAL;
3268c63e 1439 goto out_put;
3b42d28b
CL
1440 }
1441
86c3a764
DQ
1442 err = security_task_movememory(task);
1443 if (err)
3268c63e 1444 goto out_put;
86c3a764 1445
3268c63e
CL
1446 mm = get_task_mm(task);
1447 put_task_struct(task);
f2a9ef88
SL
1448
1449 if (!mm) {
3268c63e 1450 err = -EINVAL;
f2a9ef88
SL
1451 goto out;
1452 }
1453
1454 err = do_migrate_pages(mm, old, new,
1455 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
3268c63e
CL
1456
1457 mmput(mm);
1458out:
596d7cfa
KM
1459 NODEMASK_SCRATCH_FREE(scratch);
1460
39743889 1461 return err;
3268c63e
CL
1462
1463out_put:
1464 put_task_struct(task);
1465 goto out;
1466
39743889
CL
1467}
1468
1469
8bccd85f 1470/* Retrieve NUMA policy */
938bb9f5
HC
1471SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1472 unsigned long __user *, nmask, unsigned long, maxnode,
1473 unsigned long, addr, unsigned long, flags)
8bccd85f 1474{
dbcb0f19
AB
1475 int err;
1476 int uninitialized_var(pval);
8bccd85f
CL
1477 nodemask_t nodes;
1478
1479 if (nmask != NULL && maxnode < MAX_NUMNODES)
1480 return -EINVAL;
1481
1482 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1483
1484 if (err)
1485 return err;
1486
1487 if (policy && put_user(pval, policy))
1488 return -EFAULT;
1489
1490 if (nmask)
1491 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1492
1493 return err;
1494}
1495
1da177e4
LT
1496#ifdef CONFIG_COMPAT
1497
c93e0f6c
HC
1498COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1499 compat_ulong_t __user *, nmask,
1500 compat_ulong_t, maxnode,
1501 compat_ulong_t, addr, compat_ulong_t, flags)
1da177e4
LT
1502{
1503 long err;
1504 unsigned long __user *nm = NULL;
1505 unsigned long nr_bits, alloc_size;
1506 DECLARE_BITMAP(bm, MAX_NUMNODES);
1507
1508 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1509 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1510
1511 if (nmask)
1512 nm = compat_alloc_user_space(alloc_size);
1513
1514 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1515
1516 if (!err && nmask) {
2bbff6c7
KH
1517 unsigned long copy_size;
1518 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1519 err = copy_from_user(bm, nm, copy_size);
1da177e4
LT
1520 /* ensure entire bitmap is zeroed */
1521 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1522 err |= compat_put_bitmap(nmask, bm, nr_bits);
1523 }
1524
1525 return err;
1526}
1527
c93e0f6c
HC
1528COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1529 compat_ulong_t, maxnode)
1da177e4
LT
1530{
1531 long err = 0;
1532 unsigned long __user *nm = NULL;
1533 unsigned long nr_bits, alloc_size;
1534 DECLARE_BITMAP(bm, MAX_NUMNODES);
1535
1536 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1537 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1538
1539 if (nmask) {
1540 err = compat_get_bitmap(bm, nmask, nr_bits);
1541 nm = compat_alloc_user_space(alloc_size);
1542 err |= copy_to_user(nm, bm, alloc_size);
1543 }
1544
1545 if (err)
1546 return -EFAULT;
1547
1548 return sys_set_mempolicy(mode, nm, nr_bits+1);
1549}
1550
c93e0f6c
HC
1551COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1552 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1553 compat_ulong_t, maxnode, compat_ulong_t, flags)
1da177e4
LT
1554{
1555 long err = 0;
1556 unsigned long __user *nm = NULL;
1557 unsigned long nr_bits, alloc_size;
dfcd3c0d 1558 nodemask_t bm;
1da177e4
LT
1559
1560 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1561 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1562
1563 if (nmask) {
dfcd3c0d 1564 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1da177e4 1565 nm = compat_alloc_user_space(alloc_size);
dfcd3c0d 1566 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1da177e4
LT
1567 }
1568
1569 if (err)
1570 return -EFAULT;
1571
1572 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1573}
1574
1575#endif
1576
74d2c3a0
ON
1577struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1578 unsigned long addr)
1da177e4 1579{
8d90274b 1580 struct mempolicy *pol = NULL;
1da177e4
LT
1581
1582 if (vma) {
480eccf9 1583 if (vma->vm_ops && vma->vm_ops->get_policy) {
8d90274b 1584 pol = vma->vm_ops->get_policy(vma, addr);
00442ad0 1585 } else if (vma->vm_policy) {
1da177e4 1586 pol = vma->vm_policy;
00442ad0
MG
1587
1588 /*
1589 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1590 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1591 * count on these policies which will be dropped by
1592 * mpol_cond_put() later
1593 */
1594 if (mpol_needs_cond_ref(pol))
1595 mpol_get(pol);
1596 }
1da177e4 1597 }
f15ca78e 1598
74d2c3a0
ON
1599 return pol;
1600}
1601
1602/*
dd6eecb9 1603 * get_vma_policy(@vma, @addr)
74d2c3a0
ON
1604 * @vma: virtual memory area whose policy is sought
1605 * @addr: address in @vma for shared policy lookup
1606 *
1607 * Returns effective policy for a VMA at specified address.
dd6eecb9 1608 * Falls back to current->mempolicy or system default policy, as necessary.
74d2c3a0
ON
1609 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1610 * count--added by the get_policy() vm_op, as appropriate--to protect against
1611 * freeing by another task. It is the caller's responsibility to free the
1612 * extra reference for shared policies.
1613 */
dd6eecb9
ON
1614static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1615 unsigned long addr)
74d2c3a0
ON
1616{
1617 struct mempolicy *pol = __get_vma_policy(vma, addr);
1618
8d90274b 1619 if (!pol)
dd6eecb9 1620 pol = get_task_policy(current);
8d90274b 1621
1da177e4
LT
1622 return pol;
1623}
1624
6b6482bb 1625bool vma_policy_mof(struct vm_area_struct *vma)
fc314724 1626{
6b6482bb 1627 struct mempolicy *pol;
fc314724 1628
6b6482bb
ON
1629 if (vma->vm_ops && vma->vm_ops->get_policy) {
1630 bool ret = false;
fc314724 1631
6b6482bb
ON
1632 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1633 if (pol && (pol->flags & MPOL_F_MOF))
1634 ret = true;
1635 mpol_cond_put(pol);
8d90274b 1636
6b6482bb 1637 return ret;
fc314724
MG
1638 }
1639
6b6482bb 1640 pol = vma->vm_policy;
8d90274b 1641 if (!pol)
6b6482bb 1642 pol = get_task_policy(current);
8d90274b 1643
fc314724
MG
1644 return pol->flags & MPOL_F_MOF;
1645}
1646
d3eb1570
LJ
1647static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1648{
1649 enum zone_type dynamic_policy_zone = policy_zone;
1650
1651 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1652
	/*
	 * If policy->v.nodes has movable memory only,
	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
	 *
	 * policy->v.nodes is intersected with node_states[N_MEMORY],
	 * so if the following test fails, it implies
	 * policy->v.nodes has movable memory only.
	 */
1661 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1662 dynamic_policy_zone = ZONE_MOVABLE;
1663
1664 return zone >= dynamic_policy_zone;
1665}
1666
52cd3b07
LS
1667/*
1668 * Return a nodemask representing a mempolicy for filtering nodes for
1669 * page allocation
1670 */
1671static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
19770b32
MG
1672{
1673 /* Lower zones don't get a nodemask applied for MPOL_BIND */
45c4745a 1674 if (unlikely(policy->mode == MPOL_BIND) &&
d3eb1570 1675 apply_policy_zone(policy, gfp_zone(gfp)) &&
19770b32
MG
1676 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1677 return &policy->v.nodes;
1678
1679 return NULL;
1680}
1681
52cd3b07 1682/* Return a zonelist indicated by gfp for node representing a mempolicy */
2f5f9486
AK
1683static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1684 int nd)
1da177e4 1685{
6d840958
MH
1686 if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1687 nd = policy->v.preferred_node;
1688 else {
19770b32 1689 /*
6d840958
MH
1690 * __GFP_THISNODE shouldn't even be used with the bind policy
1691 * because we might easily break the expectation to stay on the
1692 * requested node and not break the policy.
19770b32 1693 */
6d840958 1694 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1da177e4 1695 }
6d840958 1696
0e88460d 1697 return node_zonelist(nd, gfp);
1da177e4
LT
1698}
1699
1700/* Do dynamic interleaving for a process */
1701static unsigned interleave_nodes(struct mempolicy *policy)
1702{
1703 unsigned nid, next;
1704 struct task_struct *me = current;
1705
1706 nid = me->il_next;
0edaf86c 1707 next = next_node_in(nid, policy->v.nodes);
f5b087b5
DR
1708 if (next < MAX_NUMNODES)
1709 me->il_next = next;
1da177e4
LT
1710 return nid;
1711}
1712
dc85da15
CL
1713/*
1714 * Depending on the memory policy provide a node from which to allocate the
1715 * next slab entry.
1716 */
2a389610 1717unsigned int mempolicy_slab_node(void)
dc85da15 1718{
e7b691b0 1719 struct mempolicy *policy;
2a389610 1720 int node = numa_mem_id();
e7b691b0
AK
1721
1722 if (in_interrupt())
2a389610 1723 return node;
e7b691b0
AK
1724
1725 policy = current->mempolicy;
fc36b8d3 1726 if (!policy || policy->flags & MPOL_F_LOCAL)
2a389610 1727 return node;
bea904d5
LS
1728
1729 switch (policy->mode) {
1730 case MPOL_PREFERRED:
fc36b8d3
LS
1731 /*
1732 * handled MPOL_F_LOCAL above
1733 */
1734 return policy->v.preferred_node;
765c4507 1735
dc85da15
CL
1736 case MPOL_INTERLEAVE:
1737 return interleave_nodes(policy);
1738
dd1a239f 1739 case MPOL_BIND: {
c33d6c06
MG
1740 struct zoneref *z;
1741
dc85da15
CL
1742 /*
1743 * Follow bind policy behavior and start allocation at the
1744 * first node.
1745 */
19770b32 1746 struct zonelist *zonelist;
19770b32 1747 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
c9634cf0 1748 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
c33d6c06
MG
1749 z = first_zones_zonelist(zonelist, highest_zoneidx,
1750 &policy->v.nodes);
1751 return z->zone ? z->zone->node : node;
dd1a239f 1752 }
dc85da15 1753
dc85da15 1754 default:
bea904d5 1755 BUG();
dc85da15
CL
1756 }
1757}
1758
fee83b3a
AM
1759/*
1760 * Do static interleaving for a VMA with known offset @n. Returns the n'th
1761 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1762 * number of present nodes.
1763 */
1da177e4 1764static unsigned offset_il_node(struct mempolicy *pol,
fee83b3a 1765 struct vm_area_struct *vma, unsigned long n)
1da177e4 1766{
dfcd3c0d 1767 unsigned nnodes = nodes_weight(pol->v.nodes);
f5b087b5 1768 unsigned target;
fee83b3a
AM
1769 int i;
1770 int nid;
1da177e4 1771
f5b087b5
DR
1772 if (!nnodes)
1773 return numa_node_id();
fee83b3a
AM
1774 target = (unsigned int)n % nnodes;
1775 nid = first_node(pol->v.nodes);
1776 for (i = 0; i < target; i++)
dfcd3c0d 1777 nid = next_node(nid, pol->v.nodes);
1da177e4
LT
1778 return nid;
1779}
1780
5da7ca86
CL
1781/* Determine a node number for interleave */
1782static inline unsigned interleave_nid(struct mempolicy *pol,
1783 struct vm_area_struct *vma, unsigned long addr, int shift)
1784{
1785 if (vma) {
1786 unsigned long off;
1787
3b98b087
NA
1788 /*
1789	 * For small pages, there is no difference between
1790	 * shift and PAGE_SHIFT, so the bit-shift is safe.
1791	 * For huge pages, since vm_pgoff is in units of small
1792	 * pages, we need to shift off the bits that are always
1793	 * zero to get a useful offset.
1794 */
1795 BUG_ON(shift < PAGE_SHIFT);
1796 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
5da7ca86
CL
1797 off += (addr - vma->vm_start) >> shift;
1798 return offset_il_node(pol, vma, off);
1799 } else
1800 return interleave_nodes(pol);
1801}
1802
00ac59ad 1803#ifdef CONFIG_HUGETLBFS
480eccf9
LS
1804/*
1805 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
b46e14ac
FF
1806 * @vma: virtual memory area whose policy is sought
1807 * @addr: address in @vma for shared policy lookup and interleave policy
1808 * @gfp_flags: for requested zone
1809 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1810 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
480eccf9 1811 *
52cd3b07
LS
1812 * Returns a zonelist suitable for a huge page allocation and a pointer
1813 * to the struct mempolicy for conditional unref after allocation.
1814	 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1815 * @nodemask for filtering the zonelist.
c0ff7453 1816 *
d26914d1 1817 * Must be protected by read_mems_allowed_begin()
480eccf9 1818 */
396faf03 1819struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
19770b32
MG
1820 gfp_t gfp_flags, struct mempolicy **mpol,
1821 nodemask_t **nodemask)
5da7ca86 1822{
480eccf9 1823 struct zonelist *zl;
5da7ca86 1824
dd6eecb9 1825 *mpol = get_vma_policy(vma, addr);
19770b32 1826 *nodemask = NULL; /* assume !MPOL_BIND */
5da7ca86 1827
52cd3b07
LS
1828 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1829 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
a5516438 1830 huge_page_shift(hstate_vma(vma))), gfp_flags);
52cd3b07 1831 } else {
2f5f9486 1832 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
52cd3b07
LS
1833 if ((*mpol)->mode == MPOL_BIND)
1834 *nodemask = &(*mpol)->v.nodes;
480eccf9
LS
1835 }
1836 return zl;
5da7ca86 1837}
06808b08
LS
1838
1839/*
1840 * init_nodemask_of_mempolicy
1841 *
1842 * If the current task's mempolicy is "default" [NULL], return 'false'
1843 * to indicate default policy. Otherwise, extract the policy nodemask
1844 * for 'bind' or 'interleave' policy into the argument nodemask, or
1845 * initialize the argument nodemask to contain the single node for
1846 * 'preferred' or 'local' policy and return 'true' to indicate presence
1847 * of non-default mempolicy.
1848 *
1849 * We don't bother with reference counting the mempolicy [mpol_get/put]
1850	 * because the current task is examining its own mempolicy and a task's
1851 * mempolicy is only ever changed by the task itself.
1852 *
1853 * N.B., it is the caller's responsibility to free a returned nodemask.
1854 */
1855bool init_nodemask_of_mempolicy(nodemask_t *mask)
1856{
1857 struct mempolicy *mempolicy;
1858 int nid;
1859
1860 if (!(mask && current->mempolicy))
1861 return false;
1862
c0ff7453 1863 task_lock(current);
06808b08
LS
1864 mempolicy = current->mempolicy;
1865 switch (mempolicy->mode) {
1866 case MPOL_PREFERRED:
1867 if (mempolicy->flags & MPOL_F_LOCAL)
1868 nid = numa_node_id();
1869 else
1870 nid = mempolicy->v.preferred_node;
1871 init_nodemask_of_node(mask, nid);
1872 break;
1873
1874 case MPOL_BIND:
1875 /* Fall through */
1876 case MPOL_INTERLEAVE:
1877 *mask = mempolicy->v.nodes;
1878 break;
1879
1880 default:
1881 BUG();
1882 }
c0ff7453 1883 task_unlock(current);
06808b08
LS
1884
1885 return true;
1886}
00ac59ad 1887#endif
5da7ca86 1888
6f48d0eb
DR
1889/*
1890 * mempolicy_nodemask_intersects
1891 *
1892 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1893 * policy. Otherwise, check for intersection between mask and the policy
1894	 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1895 * policy, always return true since it may allocate elsewhere on fallback.
1896 *
1897 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1898 */
1899bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1900 const nodemask_t *mask)
1901{
1902 struct mempolicy *mempolicy;
1903 bool ret = true;
1904
1905 if (!mask)
1906 return ret;
1907 task_lock(tsk);
1908 mempolicy = tsk->mempolicy;
1909 if (!mempolicy)
1910 goto out;
1911
1912 switch (mempolicy->mode) {
1913 case MPOL_PREFERRED:
1914 /*
1915	 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
1916	 * allocate from; they may fall back to other nodes when OOM.
1917 * Thus, it's possible for tsk to have allocated memory from
1918 * nodes in mask.
1919 */
1920 break;
1921 case MPOL_BIND:
1922 case MPOL_INTERLEAVE:
1923 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1924 break;
1925 default:
1926 BUG();
1927 }
1928out:
1929 task_unlock(tsk);
1930 return ret;
1931}
1932
1da177e4
LT
1933/* Allocate a page in interleaved policy.
1934 Own path because it needs to do special accounting. */
662f3a0b
AK
1935static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1936 unsigned nid)
1da177e4
LT
1937{
1938 struct zonelist *zl;
1939 struct page *page;
1940
0e88460d 1941 zl = node_zonelist(nid, gfp);
1da177e4 1942 page = __alloc_pages(gfp, order, zl);
dd1a239f 1943 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
ca889e6c 1944 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1da177e4
LT
1945 return page;
1946}
1947
1948/**
0bbbc0b3 1949 * alloc_pages_vma - Allocate a page for a VMA.
1da177e4
LT
1950 *
1951 * @gfp:
1952	 * %GFP_USER user allocation,
1953	 * %GFP_KERNEL kernel allocation,
1954	 * %GFP_HIGHMEM highmem/user allocation,
1955	 * %GFP_FS allocation should not call back into a file system,
1956	 * %GFP_ATOMIC don't sleep.
1957 *
0bbbc0b3 1958	 * @order: Order of the GFP allocation.
1da177e4
LT
1959 * @vma: Pointer to VMA or NULL if not available.
1960 * @addr: Virtual Address of the allocation. Must be inside the VMA.
be97a41b
VB
1961 * @node: Which node to prefer for allocation (modulo policy).
1962 * @hugepage: for hugepages try only the preferred node if possible
1da177e4
LT
1963 *
1964 * This function allocates a page from the kernel page pool and applies
1965 * a NUMA policy associated with the VMA or the current process.
1966	 * When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
1967 * mm_struct of the VMA to prevent it from going away. Should be used for
be97a41b
VB
1968 * all allocations for pages that will be mapped into user space. Returns
1969 * NULL when no page can be allocated.
1da177e4
LT
1970 */
1971struct page *
0bbbc0b3 1972alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
be97a41b 1973 unsigned long addr, int node, bool hugepage)
1da177e4 1974{
cc9a6c87 1975 struct mempolicy *pol;
c0ff7453 1976 struct page *page;
cc9a6c87 1977 unsigned int cpuset_mems_cookie;
be97a41b
VB
1978 struct zonelist *zl;
1979 nodemask_t *nmask;
cc9a6c87
MG
1980
1981retry_cpuset:
dd6eecb9 1982 pol = get_vma_policy(vma, addr);
d26914d1 1983 cpuset_mems_cookie = read_mems_allowed_begin();
1da177e4 1984
0867a57c
VB
1985 if (pol->mode == MPOL_INTERLEAVE) {
1986 unsigned nid;
1987
1988 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1989 mpol_cond_put(pol);
1990 page = alloc_page_interleave(gfp, order, nid);
1991 goto out;
1992 }
1993
1994 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1995 int hpage_node = node;
1996
be97a41b
VB
1997 /*
1998 * For hugepage allocation and non-interleave policy which
0867a57c
VB
1999 * allows the current node (or other explicitly preferred
2000	 * node), we only try to allocate from the current/preferred
2001 * node and don't fall back to other nodes, as the cost of
2002 * remote accesses would likely offset THP benefits.
be97a41b
VB
2003 *
2004 * If the policy is interleave, or does not allow the current
2005 * node in its nodemask, we allocate the standard way.
2006 */
0867a57c
VB
2007 if (pol->mode == MPOL_PREFERRED &&
2008 !(pol->flags & MPOL_F_LOCAL))
2009 hpage_node = pol->v.preferred_node;
2010
be97a41b 2011 nmask = policy_nodemask(gfp, pol);
0867a57c 2012 if (!nmask || node_isset(hpage_node, *nmask)) {
be97a41b 2013 mpol_cond_put(pol);
96db800f 2014 page = __alloc_pages_node(hpage_node,
5265047a 2015 gfp | __GFP_THISNODE, order);
be97a41b
VB
2016 goto out;
2017 }
2018 }
2019
be97a41b
VB
2020 nmask = policy_nodemask(gfp, pol);
2021 zl = policy_zonelist(gfp, pol, node);
be97a41b 2022 page = __alloc_pages_nodemask(gfp, order, zl, nmask);
d51e9894 2023 mpol_cond_put(pol);
be97a41b 2024out:
d26914d1 2025 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
cc9a6c87 2026 goto retry_cpuset;
c0ff7453 2027 return page;
1da177e4
LT
2028}
2029
2030/**
2031 * alloc_pages_current - Allocate pages.
2032 *
2033 * @gfp:
2034 * %GFP_USER user allocation,
2035 * %GFP_KERNEL kernel allocation,
2036 * %GFP_HIGHMEM highmem allocation,
2037 * %GFP_FS don't call back into a file system.
2038 * %GFP_ATOMIC don't sleep.
2039 * @order: Power of two of allocation size in pages. 0 is a single page.
2040 *
2041	 * Allocate a page from the kernel page pool. When not in
2042	 * interrupt context, apply the current process's NUMA policy.
2043 * Returns NULL when no page can be allocated.
2044 *
cf2a473c 2045 * Don't call cpuset_update_task_memory_state() unless
1da177e4
LT
2046 * 1) it's ok to take cpuset_sem (can WAIT), and
2047 * 2) allocating for current task (not interrupt).
2048 */
dd0fc66f 2049struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1da177e4 2050{
8d90274b 2051 struct mempolicy *pol = &default_policy;
c0ff7453 2052 struct page *page;
cc9a6c87 2053 unsigned int cpuset_mems_cookie;
1da177e4 2054
8d90274b
ON
2055 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2056 pol = get_task_policy(current);
52cd3b07 2057
cc9a6c87 2058retry_cpuset:
d26914d1 2059 cpuset_mems_cookie = read_mems_allowed_begin();
cc9a6c87 2060
52cd3b07
LS
2061 /*
2062 * No reference counting needed for current->mempolicy
2063 * nor system default_policy
2064 */
45c4745a 2065 if (pol->mode == MPOL_INTERLEAVE)
c0ff7453
MX
2066 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2067 else
2068 page = __alloc_pages_nodemask(gfp, order,
5c4b4be3
AK
2069 policy_zonelist(gfp, pol, numa_node_id()),
2070 policy_nodemask(gfp, pol));
cc9a6c87 2071
d26914d1 2072 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
cc9a6c87
MG
2073 goto retry_cpuset;
2074
c0ff7453 2075 return page;
1da177e4
LT
2076}
2077EXPORT_SYMBOL(alloc_pages_current);
2078
ef0855d3
ON
2079int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2080{
2081 struct mempolicy *pol = mpol_dup(vma_policy(src));
2082
2083 if (IS_ERR(pol))
2084 return PTR_ERR(pol);
2085 dst->vm_policy = pol;
2086 return 0;
2087}
2088
4225399a 2089/*
846a16bf 2090 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
4225399a
PJ
2091	 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2092 * with the mems_allowed returned by cpuset_mems_allowed(). This
2093 * keeps mempolicies cpuset relative after its cpuset moves. See
2094 * further kernel/cpuset.c update_nodemask().
708c1bbc
MX
2095 *
2096	 * current's mempolicy may be rebound by another task (the task that changes
2097	 * the cpuset's mems), so we needn't do rebind work for the current task.
4225399a 2098 */
4225399a 2099
846a16bf
LS
2100/* Slow path of a mempolicy duplicate */
2101struct mempolicy *__mpol_dup(struct mempolicy *old)
1da177e4
LT
2102{
2103 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2104
2105 if (!new)
2106 return ERR_PTR(-ENOMEM);
708c1bbc
MX
2107
2108 /* task's mempolicy is protected by alloc_lock */
2109 if (old == current->mempolicy) {
2110 task_lock(current);
2111 *new = *old;
2112 task_unlock(current);
2113 } else
2114 *new = *old;
2115
4225399a
PJ
2116 if (current_cpuset_is_being_rebound()) {
2117 nodemask_t mems = cpuset_mems_allowed(current);
708c1bbc
MX
2118 if (new->flags & MPOL_F_REBINDING)
2119 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2120 else
2121 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
4225399a 2122 }
1da177e4 2123 atomic_set(&new->refcnt, 1);
1da177e4
LT
2124 return new;
2125}
2126
2127/* Slow path of a mempolicy comparison */
fcfb4dcc 2128bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1da177e4
LT
2129{
2130 if (!a || !b)
fcfb4dcc 2131 return false;
45c4745a 2132 if (a->mode != b->mode)
fcfb4dcc 2133 return false;
19800502 2134 if (a->flags != b->flags)
fcfb4dcc 2135 return false;
19800502
BL
2136 if (mpol_store_user_nodemask(a))
2137 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
fcfb4dcc 2138 return false;
19800502 2139
45c4745a 2140 switch (a->mode) {
19770b32
MG
2141 case MPOL_BIND:
2142 /* Fall through */
1da177e4 2143 case MPOL_INTERLEAVE:
fcfb4dcc 2144 return !!nodes_equal(a->v.nodes, b->v.nodes);
1da177e4 2145 case MPOL_PREFERRED:
75719661 2146 return a->v.preferred_node == b->v.preferred_node;
1da177e4
LT
2147 default:
2148 BUG();
fcfb4dcc 2149 return false;
1da177e4
LT
2150 }
2151}
2152
1da177e4
LT
2153/*
2154 * Shared memory backing store policy support.
2155 *
2156 * Remember policies even when nobody has shared memory mapped.
2157	 * The policies are kept in a red-black tree linked from the inode.
4a8c7bb5 2158 * They are protected by the sp->lock rwlock, which should be held
1da177e4
LT
2159 * for any accesses to the tree.
2160 */
2161
4a8c7bb5
NZ
2162/*
2163	 * Look up the first element intersecting start-end. Caller holds sp->lock for
2164 * reading or for writing
2165 */
1da177e4
LT
2166static struct sp_node *
2167sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2168{
2169 struct rb_node *n = sp->root.rb_node;
2170
2171 while (n) {
2172 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2173
2174 if (start >= p->end)
2175 n = n->rb_right;
2176 else if (end <= p->start)
2177 n = n->rb_left;
2178 else
2179 break;
2180 }
2181 if (!n)
2182 return NULL;
2183 for (;;) {
2184 struct sp_node *w = NULL;
2185 struct rb_node *prev = rb_prev(n);
2186 if (!prev)
2187 break;
2188 w = rb_entry(prev, struct sp_node, nd);
2189 if (w->end <= start)
2190 break;
2191 n = prev;
2192 }
2193 return rb_entry(n, struct sp_node, nd);
2194}
2195
4a8c7bb5
NZ
2196/*
2197 * Insert a new shared policy into the list. Caller holds sp->lock for
2198 * writing.
2199 */
1da177e4
LT
2200static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2201{
2202 struct rb_node **p = &sp->root.rb_node;
2203 struct rb_node *parent = NULL;
2204 struct sp_node *nd;
2205
2206 while (*p) {
2207 parent = *p;
2208 nd = rb_entry(parent, struct sp_node, nd);
2209 if (new->start < nd->start)
2210 p = &(*p)->rb_left;
2211 else if (new->end > nd->end)
2212 p = &(*p)->rb_right;
2213 else
2214 BUG();
2215 }
2216 rb_link_node(&new->nd, parent, p);
2217 rb_insert_color(&new->nd, &sp->root);
140d5a49 2218 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
45c4745a 2219 new->policy ? new->policy->mode : 0);
1da177e4
LT
2220}
2221
2222/* Find shared policy intersecting idx */
2223struct mempolicy *
2224mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2225{
2226 struct mempolicy *pol = NULL;
2227 struct sp_node *sn;
2228
2229 if (!sp->root.rb_node)
2230 return NULL;
4a8c7bb5 2231 read_lock(&sp->lock);
1da177e4
LT
2232 sn = sp_lookup(sp, idx, idx+1);
2233 if (sn) {
2234 mpol_get(sn->policy);
2235 pol = sn->policy;
2236 }
4a8c7bb5 2237 read_unlock(&sp->lock);
1da177e4
LT
2238 return pol;
2239}
2240
63f74ca2
KM
2241static void sp_free(struct sp_node *n)
2242{
2243 mpol_put(n->policy);
2244 kmem_cache_free(sn_cache, n);
2245}
2246
771fb4d8
LS
2247/**
2248 * mpol_misplaced - check whether current page node is valid in policy
2249 *
b46e14ac
FF
2250 * @page: page to be checked
2251 * @vma: vm area where page mapped
2252 * @addr: virtual address where page mapped
771fb4d8
LS
2253 *
2254	 * Look up the current policy node id for vma,addr and "compare to" the page's
2255 * node id.
2256 *
2257 * Returns:
2258 * -1 - not misplaced, page is in the right node
2259 * node - node id where the page should be
2260 *
2261 * Policy determination "mimics" alloc_page_vma().
2262 * Called from fault path where we know the vma and faulting address.
2263 */
2264int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2265{
2266 struct mempolicy *pol;
c33d6c06 2267 struct zoneref *z;
771fb4d8
LS
2268 int curnid = page_to_nid(page);
2269 unsigned long pgoff;
90572890
PZ
2270 int thiscpu = raw_smp_processor_id();
2271 int thisnid = cpu_to_node(thiscpu);
771fb4d8
LS
2272 int polnid = -1;
2273 int ret = -1;
2274
2275 BUG_ON(!vma);
2276
dd6eecb9 2277 pol = get_vma_policy(vma, addr);
771fb4d8
LS
2278 if (!(pol->flags & MPOL_F_MOF))
2279 goto out;
2280
2281 switch (pol->mode) {
2282 case MPOL_INTERLEAVE:
2283 BUG_ON(addr >= vma->vm_end);
2284 BUG_ON(addr < vma->vm_start);
2285
2286 pgoff = vma->vm_pgoff;
2287 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2288 polnid = offset_il_node(pol, vma, pgoff);
2289 break;
2290
2291 case MPOL_PREFERRED:
2292 if (pol->flags & MPOL_F_LOCAL)
2293 polnid = numa_node_id();
2294 else
2295 polnid = pol->v.preferred_node;
2296 break;
2297
2298 case MPOL_BIND:
c33d6c06 2299
771fb4d8
LS
2300 /*
2301	 * MPOL_BIND allows binding to multiple nodes.
2302	 * Use the current page's node if it is in the policy nodemask,
2303	 * else select the nearest allowed node, if any.
2304	 * If there are no allowed nodes, use the current node [!misplaced].
2305 */
2306 if (node_isset(curnid, pol->v.nodes))
2307 goto out;
c33d6c06 2308 z = first_zones_zonelist(
771fb4d8
LS
2309 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2310 gfp_zone(GFP_HIGHUSER),
c33d6c06
MG
2311 &pol->v.nodes);
2312 polnid = z->zone->node;
771fb4d8
LS
2313 break;
2314
2315 default:
2316 BUG();
2317 }
5606e387
MG
2318
2319 /* Migrate the page towards the node whose CPU is referencing it */
e42c8ff2 2320 if (pol->flags & MPOL_F_MORON) {
90572890 2321 polnid = thisnid;
5606e387 2322
10f39042 2323 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
de1c9ce6 2324 goto out;
e42c8ff2
MG
2325 }
2326
771fb4d8
LS
2327 if (curnid != polnid)
2328 ret = polnid;
2329out:
2330 mpol_cond_put(pol);
2331
2332 return ret;
2333}
2334
c11600e4
DR
2335/*
2336 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2337 * dropped after task->mempolicy is set to NULL so that any allocation done as
2338 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2339 * policy.
2340 */
2341void mpol_put_task_policy(struct task_struct *task)
2342{
2343 struct mempolicy *pol;
2344
2345 task_lock(task);
2346 pol = task->mempolicy;
2347 task->mempolicy = NULL;
2348 task_unlock(task);
2349 mpol_put(pol);
2350}
2351
1da177e4
LT
2352static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2353{
140d5a49 2354	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1da177e4 2355 rb_erase(&n->nd, &sp->root);
63f74ca2 2356 sp_free(n);
1da177e4
LT
2357}
2358
42288fe3
MG
2359static void sp_node_init(struct sp_node *node, unsigned long start,
2360 unsigned long end, struct mempolicy *pol)
2361{
2362 node->start = start;
2363 node->end = end;
2364 node->policy = pol;
2365}
2366
dbcb0f19
AB
2367static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2368 struct mempolicy *pol)
1da177e4 2369{
869833f2
KM
2370 struct sp_node *n;
2371 struct mempolicy *newpol;
1da177e4 2372
869833f2 2373 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1da177e4
LT
2374 if (!n)
2375 return NULL;
869833f2
KM
2376
2377 newpol = mpol_dup(pol);
2378 if (IS_ERR(newpol)) {
2379 kmem_cache_free(sn_cache, n);
2380 return NULL;
2381 }
2382 newpol->flags |= MPOL_F_SHARED;
42288fe3 2383 sp_node_init(n, start, end, newpol);
869833f2 2384
1da177e4
LT
2385 return n;
2386}
2387
2388/* Replace a policy range. */
2389static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2390 unsigned long end, struct sp_node *new)
2391{
b22d127a 2392 struct sp_node *n;
42288fe3
MG
2393 struct sp_node *n_new = NULL;
2394 struct mempolicy *mpol_new = NULL;
b22d127a 2395 int ret = 0;
1da177e4 2396
42288fe3 2397restart:
4a8c7bb5 2398 write_lock(&sp->lock);
1da177e4
LT
2399 n = sp_lookup(sp, start, end);
2400 /* Take care of old policies in the same range. */
2401 while (n && n->start < end) {
2402 struct rb_node *next = rb_next(&n->nd);
2403 if (n->start >= start) {
2404 if (n->end <= end)
2405 sp_delete(sp, n);
2406 else
2407 n->start = end;
2408 } else {
2409 /* Old policy spanning whole new range. */
2410 if (n->end > end) {
42288fe3
MG
2411 if (!n_new)
2412 goto alloc_new;
2413
2414 *mpol_new = *n->policy;
2415 atomic_set(&mpol_new->refcnt, 1);
7880639c 2416 sp_node_init(n_new, end, n->end, mpol_new);
1da177e4 2417 n->end = start;
5ca39575 2418 sp_insert(sp, n_new);
42288fe3
MG
2419 n_new = NULL;
2420 mpol_new = NULL;
1da177e4
LT
2421 break;
2422 } else
2423 n->end = start;
2424 }
2425 if (!next)
2426 break;
2427 n = rb_entry(next, struct sp_node, nd);
2428 }
2429 if (new)
2430 sp_insert(sp, new);
4a8c7bb5 2431 write_unlock(&sp->lock);
42288fe3
MG
2432 ret = 0;
2433
2434err_out:
2435 if (mpol_new)
2436 mpol_put(mpol_new);
2437 if (n_new)
2438 kmem_cache_free(sn_cache, n_new);
2439
b22d127a 2440 return ret;
42288fe3
MG
2441
2442alloc_new:
4a8c7bb5 2443 write_unlock(&sp->lock);
42288fe3
MG
2444 ret = -ENOMEM;
2445 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2446 if (!n_new)
2447 goto err_out;
2448 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2449 if (!mpol_new)
2450 goto err_out;
2451 goto restart;
1da177e4
LT
2452}
2453
71fe804b
LS
2454/**
2455 * mpol_shared_policy_init - initialize shared policy for inode
2456 * @sp: pointer to inode shared policy
2457 * @mpol: struct mempolicy to install
2458 *
2459 * Install non-NULL @mpol in inode's shared policy rb-tree.
2460 * On entry, the current task has a reference on a non-NULL @mpol.
2461 * This must be released on exit.
4bfc4495 2462	 * This is called from get_inode() calls, so GFP_KERNEL can be used.
71fe804b
LS
2463 */
2464void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2465{
58568d2a
MX
2466 int ret;
2467
71fe804b 2468 sp->root = RB_ROOT; /* empty tree == default mempolicy */
4a8c7bb5 2469 rwlock_init(&sp->lock);
71fe804b
LS
2470
2471 if (mpol) {
2472 struct vm_area_struct pvma;
2473 struct mempolicy *new;
4bfc4495 2474 NODEMASK_SCRATCH(scratch);
71fe804b 2475
4bfc4495 2476 if (!scratch)
5c0c1654 2477 goto put_mpol;
71fe804b
LS
2478 /* contextualize the tmpfs mount point mempolicy */
2479 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
15d77835 2480 if (IS_ERR(new))
0cae3457 2481 goto free_scratch; /* no valid nodemask intersection */
58568d2a
MX
2482
2483 task_lock(current);
4bfc4495 2484 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
58568d2a 2485 task_unlock(current);
15d77835 2486 if (ret)
5c0c1654 2487 goto put_new;
71fe804b
LS
2488
2489 /* Create pseudo-vma that contains just the policy */
2490 memset(&pvma, 0, sizeof(struct vm_area_struct));
2491 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2492 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
15d77835 2493
5c0c1654 2494put_new:
71fe804b 2495 mpol_put(new); /* drop initial ref */
0cae3457 2496free_scratch:
4bfc4495 2497 NODEMASK_SCRATCH_FREE(scratch);
5c0c1654
LS
2498put_mpol:
2499 mpol_put(mpol); /* drop our incoming ref on sb mpol */
7339ff83
RH
2500 }
2501}
2502
1da177e4
LT
2503int mpol_set_shared_policy(struct shared_policy *info,
2504 struct vm_area_struct *vma, struct mempolicy *npol)
2505{
2506 int err;
2507 struct sp_node *new = NULL;
2508 unsigned long sz = vma_pages(vma);
2509
028fec41 2510 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1da177e4 2511 vma->vm_pgoff,
45c4745a 2512 sz, npol ? npol->mode : -1,
028fec41 2513 npol ? npol->flags : -1,
00ef2d2f 2514 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
1da177e4
LT
2515
2516 if (npol) {
2517 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2518 if (!new)
2519 return -ENOMEM;
2520 }
2521 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2522 if (err && new)
63f74ca2 2523 sp_free(new);
1da177e4
LT
2524 return err;
2525}
2526
2527/* Free a backing policy store on inode delete. */
2528void mpol_free_shared_policy(struct shared_policy *p)
2529{
2530 struct sp_node *n;
2531 struct rb_node *next;
2532
2533 if (!p->root.rb_node)
2534 return;
4a8c7bb5 2535 write_lock(&p->lock);
1da177e4
LT
2536 next = rb_first(&p->root);
2537 while (next) {
2538 n = rb_entry(next, struct sp_node, nd);
2539 next = rb_next(&n->nd);
63f74ca2 2540 sp_delete(p, n);
1da177e4 2541 }
4a8c7bb5 2542 write_unlock(&p->lock);
1da177e4
LT
2543}
2544
1a687c2e 2545#ifdef CONFIG_NUMA_BALANCING
c297663c 2546static int __initdata numabalancing_override;
1a687c2e
MG
2547
2548static void __init check_numabalancing_enable(void)
2549{
2550 bool numabalancing_default = false;
2551
2552 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2553 numabalancing_default = true;
2554
c297663c
MG
2555 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2556 if (numabalancing_override)
2557 set_numabalancing_state(numabalancing_override == 1);
2558
b0dc2b9b 2559 if (num_online_nodes() > 1 && !numabalancing_override) {
756a025f 2560 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
c297663c 2561 numabalancing_default ? "Enabling" : "Disabling");
1a687c2e
MG
2562 set_numabalancing_state(numabalancing_default);
2563 }
2564}
2565
2566static int __init setup_numabalancing(char *str)
2567{
2568 int ret = 0;
2569 if (!str)
2570 goto out;
1a687c2e
MG
2571
2572 if (!strcmp(str, "enable")) {
c297663c 2573 numabalancing_override = 1;
1a687c2e
MG
2574 ret = 1;
2575 } else if (!strcmp(str, "disable")) {
c297663c 2576 numabalancing_override = -1;
1a687c2e
MG
2577 ret = 1;
2578 }
2579out:
2580 if (!ret)
4a404bea 2581 pr_warn("Unable to parse numa_balancing=\n");
1a687c2e
MG
2582
2583 return ret;
2584}
2585__setup("numa_balancing=", setup_numabalancing);
2586#else
2587static inline void __init check_numabalancing_enable(void)
2588{
2589}
2590#endif /* CONFIG_NUMA_BALANCING */
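
In practice the override parsed above comes from the numa_balancing=enable or numa_balancing=disable kernel command-line option, and the pr_info() in check_numabalancing_enable() points at the kernel.numa_balancing sysctl for changing the setting at runtime.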
2591
1da177e4
LT
2592/* assumes fs == KERNEL_DS */
2593void __init numa_policy_init(void)
2594{
b71636e2
PM
2595 nodemask_t interleave_nodes;
2596 unsigned long largest = 0;
2597 int nid, prefer = 0;
2598
1da177e4
LT
2599 policy_cache = kmem_cache_create("numa_policy",
2600 sizeof(struct mempolicy),
20c2df83 2601 0, SLAB_PANIC, NULL);
1da177e4
LT
2602
2603 sn_cache = kmem_cache_create("shared_policy_node",
2604 sizeof(struct sp_node),
20c2df83 2605 0, SLAB_PANIC, NULL);
1da177e4 2606
5606e387
MG
2607 for_each_node(nid) {
2608 preferred_node_policy[nid] = (struct mempolicy) {
2609 .refcnt = ATOMIC_INIT(1),
2610 .mode = MPOL_PREFERRED,
2611 .flags = MPOL_F_MOF | MPOL_F_MORON,
2612 .v = { .preferred_node = nid, },
2613 };
2614 }
2615
b71636e2
PM
2616 /*
2617	 * Set interleaving policy for system init. Interleaving is only
2618	 * enabled across suitably sized nodes (default is >= 16MB); otherwise,
2619	 * fall back to the largest node if they're all smaller.
2620 */
2621 nodes_clear(interleave_nodes);
01f13bd6 2622 for_each_node_state(nid, N_MEMORY) {
b71636e2
PM
2623 unsigned long total_pages = node_present_pages(nid);
2624
2625 /* Preserve the largest node */
2626 if (largest < total_pages) {
2627 largest = total_pages;
2628 prefer = nid;
2629 }
2630
2631 /* Interleave this node? */
2632 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2633 node_set(nid, interleave_nodes);
2634 }
2635
2636 /* All too small, use the largest */
2637 if (unlikely(nodes_empty(interleave_nodes)))
2638 node_set(prefer, interleave_nodes);
1da177e4 2639
028fec41 2640 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
b1de0d13 2641 pr_err("%s: interleaving failed\n", __func__);
1a687c2e
MG
2642
2643 check_numabalancing_enable();
1da177e4
LT
2644}
2645
8bccd85f 2646/* Reset policy of current process to default */
1da177e4
LT
2647void numa_default_policy(void)
2648{
028fec41 2649 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1da177e4 2650}
68860ec1 2651
095f1fc4
LS
2652/*
2653 * Parse and format mempolicy from/to strings
2654 */
2655
1a75a6c8 2656/*
f2a07f40 2657	 * "local" is implemented internally by MPOL_PREFERRED with the MPOL_F_LOCAL flag.
1a75a6c8 2658 */
345ace9c
LS
2659static const char * const policy_modes[] =
2660{
2661 [MPOL_DEFAULT] = "default",
2662 [MPOL_PREFERRED] = "prefer",
2663 [MPOL_BIND] = "bind",
2664 [MPOL_INTERLEAVE] = "interleave",
d3a71033 2665 [MPOL_LOCAL] = "local",
345ace9c 2666};
1a75a6c8 2667
095f1fc4
LS
2668
2669#ifdef CONFIG_TMPFS
2670/**
f2a07f40 2671 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
095f1fc4 2672 * @str: string containing mempolicy to parse
71fe804b 2673 * @mpol: pointer to struct mempolicy pointer, returned on success.
095f1fc4
LS
2674 *
2675 * Format of input:
2676 * <mode>[=<flags>][:<nodelist>]
2677 *
71fe804b 2678 * On success, returns 0, else 1
095f1fc4 2679 */
a7a88b23 2680int mpol_parse_str(char *str, struct mempolicy **mpol)
095f1fc4 2681{
71fe804b 2682 struct mempolicy *new = NULL;
b4652e84 2683 unsigned short mode;
f2a07f40 2684 unsigned short mode_flags;
71fe804b 2685 nodemask_t nodes;
095f1fc4
LS
2686 char *nodelist = strchr(str, ':');
2687 char *flags = strchr(str, '=');
095f1fc4
LS
2688 int err = 1;
2689
2690 if (nodelist) {
2691 /* NUL-terminate mode or flags string */
2692 *nodelist++ = '\0';
71fe804b 2693 if (nodelist_parse(nodelist, nodes))
095f1fc4 2694 goto out;
01f13bd6 2695 if (!nodes_subset(nodes, node_states[N_MEMORY]))
095f1fc4 2696 goto out;
71fe804b
LS
2697 } else
2698 nodes_clear(nodes);
2699
095f1fc4
LS
2700 if (flags)
2701 *flags++ = '\0'; /* terminate mode string */
2702
479e2802 2703 for (mode = 0; mode < MPOL_MAX; mode++) {
345ace9c 2704 if (!strcmp(str, policy_modes[mode])) {
095f1fc4
LS
2705 break;
2706 }
2707 }
a720094d 2708 if (mode >= MPOL_MAX)
095f1fc4
LS
2709 goto out;
2710
71fe804b 2711 switch (mode) {
095f1fc4 2712 case MPOL_PREFERRED:
71fe804b
LS
2713 /*
2714 * Insist on a nodelist of one node only
2715 */
095f1fc4
LS
2716 if (nodelist) {
2717 char *rest = nodelist;
2718 while (isdigit(*rest))
2719 rest++;
926f2ae0
KM
2720 if (*rest)
2721 goto out;
095f1fc4
LS
2722 }
2723 break;
095f1fc4
LS
2724 case MPOL_INTERLEAVE:
2725 /*
2726 * Default to online nodes with memory if no nodelist
2727 */
2728 if (!nodelist)
01f13bd6 2729 nodes = node_states[N_MEMORY];
3f226aa1 2730 break;
71fe804b 2731 case MPOL_LOCAL:
3f226aa1 2732 /*
71fe804b 2733 * Don't allow a nodelist; mpol_new() checks flags
3f226aa1 2734 */
71fe804b 2735 if (nodelist)
3f226aa1 2736 goto out;
71fe804b 2737 mode = MPOL_PREFERRED;
3f226aa1 2738 break;
413b43de
RT
2739 case MPOL_DEFAULT:
2740 /*
2741	 * Insist on an empty nodelist
2742 */
2743 if (!nodelist)
2744 err = 0;
2745 goto out;
d69b2e63
KM
2746 case MPOL_BIND:
2747 /*
2748 * Insist on a nodelist
2749 */
2750 if (!nodelist)
2751 goto out;
095f1fc4
LS
2752 }
2753
71fe804b 2754 mode_flags = 0;
095f1fc4
LS
2755 if (flags) {
2756 /*
2757 * Currently, we only support two mutually exclusive
2758 * mode flags.
2759 */
2760 if (!strcmp(flags, "static"))
71fe804b 2761 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4 2762 else if (!strcmp(flags, "relative"))
71fe804b 2763 mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4 2764 else
926f2ae0 2765 goto out;
095f1fc4 2766 }
71fe804b
LS
2767
2768 new = mpol_new(mode, mode_flags, &nodes);
2769 if (IS_ERR(new))
926f2ae0
KM
2770 goto out;
2771
f2a07f40
HD
2772 /*
2773 * Save nodes for mpol_to_str() to show the tmpfs mount options
2774 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2775 */
2776 if (mode != MPOL_PREFERRED)
2777 new->v.nodes = nodes;
2778 else if (nodelist)
2779 new->v.preferred_node = first_node(nodes);
2780 else
2781 new->flags |= MPOL_F_LOCAL;
2782
2783 /*
2784 * Save nodes for contextualization: this will be used to "clone"
2785 * the mempolicy in a specific context [cpuset] at a later time.
2786 */
2787 new->w.user_nodemask = nodes;
2788
926f2ae0 2789 err = 0;
71fe804b 2790
095f1fc4
LS
2791out:
2792 /* Restore string for error message */
2793 if (nodelist)
2794 *--nodelist = ':';
2795 if (flags)
2796 *--flags = '=';
71fe804b
LS
2797 if (!err)
2798 *mpol = new;
095f1fc4
LS
2799 return err;
2800}
2801#endif /* CONFIG_TMPFS */
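
The string format parsed above is the tmpfs mpol= mount option. A small sketch of setting it through mount(2); the mount point, size and node range are illustrative, and the call needs CAP_SYS_ADMIN plus memory on the named nodes:

#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
	/* Interleave tmpfs pages across nodes 0-3; roughly equivalent to
	 *   mount -t tmpfs -o size=64m,mpol=interleave:0-3 tmpfs /mnt/tmp
	 */
	if (mount("tmpfs", "/mnt/tmp", "tmpfs", 0,
		  "size=64m,mpol=interleave:0-3") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}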
2802
71fe804b
LS
2803/**
2804 * mpol_to_str - format a mempolicy structure for printing
2805 * @buffer: to contain formatted mempolicy string
2806 * @maxlen: length of @buffer
2807 * @pol: pointer to mempolicy to be formatted
71fe804b 2808 *
948927ee
DR
2809 * Convert @pol into a string. If @buffer is too short, truncate the string.
2810 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2811 * longest flag, "relative", and to display at least a few node ids.
1a75a6c8 2812 */
948927ee 2813void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1a75a6c8
CL
2814{
2815 char *p = buffer;
948927ee
DR
2816 nodemask_t nodes = NODE_MASK_NONE;
2817 unsigned short mode = MPOL_DEFAULT;
2818 unsigned short flags = 0;
2291990a 2819
8790c71a 2820 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
bea904d5 2821 mode = pol->mode;
948927ee
DR
2822 flags = pol->flags;
2823 }
bea904d5 2824
1a75a6c8
CL
2825 switch (mode) {
2826 case MPOL_DEFAULT:
1a75a6c8 2827 break;
1a75a6c8 2828 case MPOL_PREFERRED:
fc36b8d3 2829 if (flags & MPOL_F_LOCAL)
f2a07f40 2830 mode = MPOL_LOCAL;
53f2556b 2831 else
fc36b8d3 2832 node_set(pol->v.preferred_node, nodes);
1a75a6c8 2833 break;
1a75a6c8 2834 case MPOL_BIND:
1a75a6c8 2835 case MPOL_INTERLEAVE:
f2a07f40 2836 nodes = pol->v.nodes;
1a75a6c8 2837 break;
1a75a6c8 2838 default:
948927ee
DR
2839 WARN_ON_ONCE(1);
2840 snprintf(p, maxlen, "unknown");
2841 return;
1a75a6c8
CL
2842 }
2843
b7a9f420 2844 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
1a75a6c8 2845
fc36b8d3 2846 if (flags & MPOL_MODE_FLAGS) {
948927ee 2847 p += snprintf(p, buffer + maxlen - p, "=");
f5b087b5 2848
2291990a
LS
2849 /*
2850 * Currently, the only defined flags are mutually exclusive
2851 */
f5b087b5 2852 if (flags & MPOL_F_STATIC_NODES)
2291990a
LS
2853 p += snprintf(p, buffer + maxlen - p, "static");
2854 else if (flags & MPOL_F_RELATIVE_NODES)
2855 p += snprintf(p, buffer + maxlen - p, "relative");
f5b087b5
DR
2856 }
2857
9e763e0f
TH
2858 if (!nodes_empty(nodes))
2859 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2860 nodemask_pr_args(&nodes));
1a75a6c8 2861}