/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocation don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
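/*
 * Illustrative userspace sketch (not part of this file): how the policies
 * described above are typically requested through the syscall wrappers in
 * libnuma's <numaif.h>. The node numbers, addr/len and the nodemask width
 * below are assumptions for the example only.
 *
 *	#include <numaif.h>
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);	// nodes 0 and 1 assumed present
 *
 *	// Process-wide interleaving over nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8 + 1);
 *
 *	// Bind one existing mapping to node 0 only
 *	unsigned long node0 = 1UL << 0;
 *	mbind(addr, len, MPOL_BIND, &node0, sizeof(node0) * 8 + 1, MPOL_MF_STRICT);
 */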

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <linux/random.h>

#include "internal.h"
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. An specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If read-side task has no lock to protect task->mempolicy, write-side
	 * task will rebind the task->mempolicy by two step. The first step is
	 * setting all the newly nodes, and the second step is cleaning all the
	 * disallowed nodes. In this way, we can avoid finding no node to alloc
	 * page.
	 * If we have a lock to protect task->mempolicy in read-side, we do
	 * rebind directly.
	 *
	 * step:
	 *	MPOL_REBIND_ONCE  - do rebind work at once
	 *	MPOL_REBIND_STEP1 - set all the newly nodes
	 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	int nd, k;

	for_each_node_mask(nd, *nodemask) {
		struct zone *z;

		for (k = 0; k <= policy_zone; k++) {
			z = &NODE_DATA(nd)->node_zones[k];
			if (z->present_pages > 0)
				return 1;
		}
	}

	return 0;
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
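/*
 * Worked example (hypothetical masks) for the relative-nodes mapping above:
 * a user mask of {0,1} folded and mapped onto an allowed set of {4,5,6}
 * selects the 1st and 2nd allowed nodes, i.e. the result is {4,5}; if the
 * allowed set later changes, the same relative mask is re-applied to it.
 */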

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!is_valid_nodemask(nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.  But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (pol == NULL)
		return 0;
	/* Check N_HIGH_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */
	else {
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
		else
			nodes_and(nsc->mask2, *nodes, nsc->mask1);

		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
						cpuset_current_mems_allowed;
	}

	if (nodes)
		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	else
		ret = mpol_ops[pol->mode].create(pol, NULL);
	return ret;
}

/*
 * This function just creates a new policy, does some check and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);

	if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		mode = MPOL_PREFERRED;
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
				enum mpol_rebind_step step)
{
}

/*
 * step:
 *	MPOL_REBIND_ONCE  - do rebind work at once
 *	MPOL_REBIND_STEP1 - set all the newly nodes
 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
				 enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		/*
		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
		 * result
		 */
		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
			nodes_remap(tmp, pol->v.nodes,
					pol->w.cpuset_mems_allowed, *nodes);
			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
		} else if (step == MPOL_REBIND_STEP2) {
			tmp = pol->w.cpuset_mems_allowed;
			pol->w.cpuset_mems_allowed = *nodes;
		} else
			BUG();
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	if (step == MPOL_REBIND_STEP1)
		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
		pol->v.nodes = tmp;
	else
		BUG();

	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = first_node(tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes,
				  enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes)) {
			pol->v.preferred_node = node;
			pol->flags &= ~MPOL_F_LOCAL;
		} else
			pol->flags |= MPOL_F_LOCAL;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (!(pol->flags & MPOL_F_LOCAL)) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * If read-side task has no lock to protect task->mempolicy, write-side
 * task will rebind the task->mempolicy by two step. The first step is
 * setting all the newly nodes, and the second step is cleaning all the
 * disallowed nodes. In this way, we can avoid finding no node to alloc
 * page.
 * If we have a lock to protect task->mempolicy in read-side, we do
 * rebind directly.
 *
 * step:
 *	MPOL_REBIND_ONCE  - do rebind work at once
 *	MPOL_REBIND_STEP1 - set all the newly nodes
 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
				enum mpol_rebind_step step)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
		return;

	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
		BUG();

	if (step == MPOL_REBIND_STEP1)
		pol->flags |= MPOL_F_REBINDING;
	else if (step == MPOL_REBIND_STEP2)
		pol->flags &= ~MPOL_F_REBINDING;
	else if (step >= MPOL_REBIND_NSTEP)
		BUG();

	mpol_ops[pol->mode].rebind(pol, newmask, step);
}
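/*
 * Illustrative sketch of the two-step rebind (assumption: a cpuset move from
 * nodes {0,1} to {2,3} while another task may read the policy locklessly):
 *
 *	mpol_rebind_policy(pol, &new_mems, MPOL_REBIND_STEP1);
 *		// pol->v.nodes == {0,1,2,3}: new nodes added, old ones kept,
 *		// so a concurrent allocation always finds at least one node.
 *	mpol_rebind_policy(pol, &new_mems, MPOL_REBIND_STEP2);
 *		// pol->v.nodes == {2,3}: the disallowed nodes are removed.
 *
 * Callers that do hold a lock can use MPOL_REBIND_ONCE to do both at once.
 */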

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
			enum mpol_rebind_step step)
{
	mpol_rebind_policy(tsk->mempolicy, new, step);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 * And we cannot move PageKsm pages sensibly or safely yet.
		 */
		if (PageReserved(page) || PageKsm(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		split_huge_page_pmd(vma->vm_mm, pmd);
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated;
	BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);

	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
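/*
 * Sketch of how this is driven in this tree (see check_range() below):
 *
 *	if (flags & MPOL_MF_LAZY) {
 *		// Protections are stripped now; the pages stay in place and
 *		// are only considered for migration later, from the NUMA
 *		// hinting fault path, if the faulting CPU is on another node.
 *		change_prot_numa(vma, start, endvma);
 *		goto next;
 *	}
 */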

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;


	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		unsigned long endvma = vma->vm_end;

		if (endvma > end)
			endvma = end;
		if (vma->vm_start > start)
			start = vma->vm_start;

		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}

		if (is_vm_hugetlb_page(vma))
			goto next;

		if (flags & MPOL_MF_LAZY) {
			change_prot_numa(vma, start, endvma);
			goto next;
		}

		if ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
		      vma_migratable(vma))) {

			err = check_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
next:
		prev = vma;
	}
	return first;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_sem held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
						struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_sem */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EFAULT;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		next = vma->vm_next;
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				  vma->anon_vma, vma->vm_file, pgoff,
				  new_pol);
		if (prev) {
			vma = prev;
			next = vma->vm_next;
			continue;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}
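/*
 * Illustrative sketch (hypothetical addresses): an mbind() over the middle
 * page of a single 3-page VMA [A, A + 3*PAGE_SIZE) that cannot be merged
 * with its neighbours leaves three VMAs after mbind_range():
 *
 *	[A,                A +   PAGE_SIZE)	old policy
 *	[A +   PAGE_SIZE,  A + 2*PAGE_SIZE)	new_pol (split at both ends)
 *	[A + 2*PAGE_SIZE,  A + 3*PAGE_SIZE)	old policy
 */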

/*
 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 * mempolicy.  Allows more rapid checking of this (combined perhaps
 * with other PF_* flag bits) on memory allocation hot code paths.
 *
 * If called from outside this file, the task 'p' should -only- be
 * a newly forked child not yet visible on the task list, because
 * manipulating the task flags of a visible task is not safe.
 *
 * The above limitation is why this routine has the funny name
 * mpol_fix_fork_child_flag().
 *
 * It is also safe to call this with a task pointer of current,
 * which the static wrapper mpol_set_task_struct_flag() does,
 * for use within this file.
 */

void mpol_fix_fork_child_flag(struct task_struct *p)
{
	if (p->mempolicy)
		p->flags |= PF_MEMPOLICY;
	else
		p->flags &= ~PF_MEMPOLICY;
}

static void mpol_set_task_struct_flag(void)
{
	mpol_fix_fork_child_flag(current);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	struct mm_struct *mm = current->mm;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}
	/*
	 * prevent changing our mempolicy while show_numa_maps()
	 * is using it.
	 * Note:  do_set_mempolicy() can be called at init time
	 * with no 'mm'.
	 */
	if (mm)
		down_write(&mm->mmap_sem);
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		if (mm)
			up_write(&mm->mmap_sem);
		mpol_put(new);
		goto out;
	}
	old = current->mempolicy;
	current->mempolicy = new;
	mpol_set_task_struct_flag();
	if (new && new->mode == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	task_unlock(current);
	if (mm)
		up_write(&mm->mmap_sem);

	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		if (!(p->flags & MPOL_F_LOCAL))
			node_set(p->v.preferred_node, *nodes);
		/* else return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
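/*
 * Illustrative userspace sketch (not part of this file): querying the node
 * backing an address via the MPOL_F_NODE|MPOL_F_ADDR path handled above.
 * Variable names are assumptions for the example only.
 *
 *	#include <numaif.h>
 *
 *	int node;
 *	if (get_mempolicy(&node, NULL, 0, ptr, MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("page at %p is on node %d\n", ptr, node);
 */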

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
		if (!isolate_lru_page(page)) {
			list_add_tail(&page->lru, pagelist);
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		}
	}
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
	 */
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, new_node_page, dest,
							false, MIGRATE_SYNC,
							MR_SYSCALL);
		if (err)
			putback_lru_pages(&pagelist);
	}

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err;
	nodemask_t tmp;

	err = migrate_prep();
	if (err)
		return err;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from, to, flags);
	if (err)
		goto out;

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning from_tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s,d;
		int source = -1;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *	    [0-7] - > [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == -1)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;

}
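/*
 * Illustrative userspace sketch (not part of this file): asking the kernel to
 * move a process's pages from node 0 to node 1 via the migrate_pages()
 * syscall wrapper, which ends up in do_migrate_pages() above. The pid and
 * node numbers are assumptions for the example only.
 *
 *	#include <numaif.h>
 *
 *	unsigned long old = 1UL << 0, new = 1UL << 1;
 *	long not_moved = migrate_pages(pid, sizeof(old) * 8 + 1, &old, &new);
 *	// not_moved < 0 on error, otherwise the count of pages left behind.
 */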

/*
 * Allocate a new page for page migration based on vma policy.
 * Start assuming that page is mapped by vma pointed to by @private.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	struct vm_area_struct *vma = (struct vm_area_struct *)private;
	unsigned long uninitialized_var(address);

	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT || mode == MPOL_NOOP)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (flags & MPOL_MF_LAZY)
		new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : -1);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			goto mpol_out;
	}
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			down_write(&mm->mmap_sem);
			task_lock(current);
			err = mpol_set_nodemask(new, nmask, scratch);
			task_unlock(current);
			if (err)
				up_write(&mm->mmap_sem);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);	/* maybe ... */
	if (!IS_ERR(vma) && mode != MPOL_NOOP)
		err = mbind_range(mm, start, end, new);

	if (!err) {
		int nr_failed = 0;

		if (!list_empty(&pagelist)) {
			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
			nr_failed = migrate_pages(&pagelist, new_vma_page,
						(unsigned long)vma,
						false, MIGRATE_SYNC,
						MR_MEMPOLICY_MBIND);
			if (nr_failed)
				putback_lru_pages(&pagelist);
		}

		if (nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	} else
		putback_lru_pages(&pagelist);

	up_write(&mm->mmap_sem);
 mpol_out:
	mpol_put(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
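/*
 * Worked example for the conversion above, assuming 64-bit longs: a caller
 * passing maxnode == 17 covers node bits 0..15 after the --maxnode, so
 * nlongs == 1 and endmask == 0xffff; any bit set at or above node 16 in the
 * copied word is cleared by the final "&= endmask".
 */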

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned, flags)
{
	nodemask_t nodes;
	int err;
	unsigned short mode_flags;

	mode_flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if (mode >= MPOL_MAX)
		return -EINVAL;
	if ((mode_flags & MPOL_F_STATIC_NODES) &&
	    (mode_flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}

/* Set the process memory policy */
SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	int err;
	nodemask_t nodes;
	unsigned short flags;

	flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, flags, &nodes);
}

SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	const struct cred *cred = current_cred(), *tcred;
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	tcred = __task_cred(task);
	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
	    !capable(CAP_SYS_NICE)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
		err = -EINVAL;
		goto out_put;
	}

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;

}


/* Retrieve NUMA policy */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	int err;
	int uninitialized_var(pval);
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		unsigned long copy_size;
		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
		err = copy_from_user(bm, nm, copy_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/*
 * get_vma_policy(@task, @vma, @addr)
 * @task - task for fallback if vma policy == default
 * @vma   - virtual memory area whose policy is sought
 * @addr  - address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to @task or system default policy, as necessary.
 * Current or other task's task mempolicy and non-shared vma policies must be
 * protected by task_lock(task) by the caller.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
struct mempolicy *get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
									addr);
			if (vpol)
				pol = vpol;
		} else if (vma->vm_policy) {
			pol = vma->vm_policy;

			/*
			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
			 * count on these policies which will be dropped by
			 * mpol_cond_put() later
			 */
			if (mpol_needs_cond_ref(pol))
				mpol_get(pol);
		}
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (unlikely(policy->mode == MPOL_BIND) &&
			gfp_zone(gfp) >= policy_zone &&
			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
		return &policy->v.nodes;

	return NULL;
}

/* Return a zonelist indicated by gfp for node representing a mempolicy */
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
	int nd)
{
	switch (policy->mode) {
	case MPOL_PREFERRED:
		if (!(policy->flags & MPOL_F_LOCAL))
			nd = policy->v.preferred_node;
		break;
	case MPOL_BIND:
		/*
		 * Normally, MPOL_BIND allocations are node-local within the
		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
		 * current node isn't part of the mask, we use the zonelist for
		 * the first node in the mask instead.
		 */
		if (unlikely(gfp & __GFP_THISNODE) &&
				unlikely(!node_isset(nd, policy->v.nodes)))
			nd = first_node(policy->v.nodes);
		break;
	default:
		BUG();
	}
	return node_zonelist(nd, gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	if (next < MAX_NUMNODES)
		me->il_next = next;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 * @policy must be protected by freeing by the caller.  If @policy is
 * the current task's mempolicy, this protection is implicit, as only the
 * task can change it's policy.  The system default policy requires no
 * such protection.
 */
unsigned slab_node(void)
{
	struct mempolicy *policy;

	if (in_interrupt())
		return numa_node_id();

	policy = current->mempolicy;
	if (!policy || policy->flags & MPOL_F_LOCAL)
		return numa_node_id();

	switch (policy->mode) {
	case MPOL_PREFERRED:
		/*
		 * handled MPOL_F_LOCAL above
		 */
		return policy->v.preferred_node;

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND: {
		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		struct zone *zone;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
		(void)first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->v.nodes,
							&zone);
		return zone ? zone->node : numa_node_id();
	}

	default:
		BUG();
	}
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target;
	int c;
	int nid = -1;

	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)off % nnodes;
	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}
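/*
 * Worked example (hypothetical mask): with pol->v.nodes = {1,3,5} the weight
 * is 3, so an offset of 7 gives target = 7 % 3 = 1 and the loop above calls
 * next_node() twice (node 1, then node 3), i.e. the page at that offset is
 * placed on node 3 -- the second set node, independent of which CPU faults.
 */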
1749
5da7ca86
CL
1750/* Determine a node number for interleave */
1751static inline unsigned interleave_nid(struct mempolicy *pol,
1752 struct vm_area_struct *vma, unsigned long addr, int shift)
1753{
1754 if (vma) {
1755 unsigned long off;
1756
3b98b087
NA
1757 /*
1758 * for small pages, there is no difference between
1759 * shift and PAGE_SHIFT, so the bit-shift is safe.
1760 * for huge pages, since vm_pgoff is in units of small
1761 * pages, we need to shift off the always 0 bits to get
1762 * a useful offset.
1763 */
1764 BUG_ON(shift < PAGE_SHIFT);
1765 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
5da7ca86
CL
1766 off += (addr - vma->vm_start) >> shift;
1767 return offset_il_node(pol, vma, off);
1768 } else
1769 return interleave_nodes(pol);
1770}
1771
778d3b0f
MH
1772/*
1773 * Return the bit number of a random bit set in the nodemask.
1774 * (returns -1 if nodemask is empty)
1775 */
1776int node_random(const nodemask_t *maskp)
1777{
1778 int w, bit = -1;
1779
1780 w = nodes_weight(*maskp);
1781 if (w)
1782 bit = bitmap_ord_to_pos(maskp->bits,
1783 get_random_int() % w, MAX_NUMNODES);
1784 return bit;
1785}
1786
00ac59ad 1787#ifdef CONFIG_HUGETLBFS
480eccf9
LS
1788/*
1789 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1790 * @vma = virtual memory area whose policy is sought
1791 * @addr = address in @vma for shared policy lookup and interleave policy
1792 * @gfp_flags = for requested zone
19770b32
MG
1793 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1794 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
480eccf9 1795 *
52cd3b07
LS
1796 * Returns a zonelist suitable for a huge page allocation and a pointer
1797 * to the struct mempolicy for conditional unref after allocation.
 1798 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1799 * @nodemask for filtering the zonelist.
c0ff7453
MX
1800 *
1801 * Must be protected by get_mems_allowed()
480eccf9 1802 */
396faf03 1803struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
19770b32
MG
1804 gfp_t gfp_flags, struct mempolicy **mpol,
1805 nodemask_t **nodemask)
5da7ca86 1806{
480eccf9 1807 struct zonelist *zl;
5da7ca86 1808
52cd3b07 1809 *mpol = get_vma_policy(current, vma, addr);
19770b32 1810 *nodemask = NULL; /* assume !MPOL_BIND */
5da7ca86 1811
52cd3b07
LS
1812 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1813 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
a5516438 1814 huge_page_shift(hstate_vma(vma))), gfp_flags);
52cd3b07 1815 } else {
2f5f9486 1816 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
52cd3b07
LS
1817 if ((*mpol)->mode == MPOL_BIND)
1818 *nodemask = &(*mpol)->v.nodes;
480eccf9
LS
1819 }
1820 return zl;
5da7ca86 1821}
06808b08
LS
1822
1823/*
1824 * init_nodemask_of_mempolicy
1825 *
1826 * If the current task's mempolicy is "default" [NULL], return 'false'
1827 * to indicate default policy. Otherwise, extract the policy nodemask
1828 * for 'bind' or 'interleave' policy into the argument nodemask, or
1829 * initialize the argument nodemask to contain the single node for
1830 * 'preferred' or 'local' policy and return 'true' to indicate presence
1831 * of non-default mempolicy.
1832 *
1833 * We don't bother with reference counting the mempolicy [mpol_get/put]
 1834 * because the current task is examining its own mempolicy and a task's
1835 * mempolicy is only ever changed by the task itself.
1836 *
1837 * N.B., it is the caller's responsibility to free a returned nodemask.
1838 */
1839bool init_nodemask_of_mempolicy(nodemask_t *mask)
1840{
1841 struct mempolicy *mempolicy;
1842 int nid;
1843
1844 if (!(mask && current->mempolicy))
1845 return false;
1846
c0ff7453 1847 task_lock(current);
06808b08
LS
1848 mempolicy = current->mempolicy;
1849 switch (mempolicy->mode) {
1850 case MPOL_PREFERRED:
1851 if (mempolicy->flags & MPOL_F_LOCAL)
1852 nid = numa_node_id();
1853 else
1854 nid = mempolicy->v.preferred_node;
1855 init_nodemask_of_node(mask, nid);
1856 break;
1857
1858 case MPOL_BIND:
1859 /* Fall through */
1860 case MPOL_INTERLEAVE:
1861 *mask = mempolicy->v.nodes;
1862 break;
1863
1864 default:
1865 BUG();
1866 }
c0ff7453 1867 task_unlock(current);
06808b08
LS
1868
1869 return true;
1870}
00ac59ad 1871#endif
5da7ca86 1872
6f48d0eb
DR
1873/*
1874 * mempolicy_nodemask_intersects
1875 *
1876 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1877 * policy. Otherwise, check for intersection between mask and the policy
 1878 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1879 * policy, always return true since it may allocate elsewhere on fallback.
1880 *
1881 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1882 */
1883bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1884 const nodemask_t *mask)
1885{
1886 struct mempolicy *mempolicy;
1887 bool ret = true;
1888
1889 if (!mask)
1890 return ret;
1891 task_lock(tsk);
1892 mempolicy = tsk->mempolicy;
1893 if (!mempolicy)
1894 goto out;
1895
1896 switch (mempolicy->mode) {
1897 case MPOL_PREFERRED:
1898 /*
1899 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1900 * allocate from, they may fallback to other nodes when oom.
1901 * Thus, it's possible for tsk to have allocated memory from
1902 * nodes in mask.
1903 */
1904 break;
1905 case MPOL_BIND:
1906 case MPOL_INTERLEAVE:
1907 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1908 break;
1909 default:
1910 BUG();
1911 }
1912out:
1913 task_unlock(tsk);
1914 return ret;
1915}
1916
1da177e4
LT
 1917 /* Allocate a page using the interleave policy.
1918 Own path because it needs to do special accounting. */
662f3a0b
AK
1919static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1920 unsigned nid)
1da177e4
LT
1921{
1922 struct zonelist *zl;
1923 struct page *page;
1924
0e88460d 1925 zl = node_zonelist(nid, gfp);
1da177e4 1926 page = __alloc_pages(gfp, order, zl);
dd1a239f 1927 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
ca889e6c 1928 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1da177e4
LT
1929 return page;
1930}
1931
1932/**
0bbbc0b3 1933 * alloc_pages_vma - Allocate a page for a VMA.
1da177e4
LT
1934 *
1935 * @gfp:
1936 * %GFP_USER user allocation.
1937 * %GFP_KERNEL kernel allocations,
1938 * %GFP_HIGHMEM highmem/user allocations,
1939 * %GFP_FS allocation should not call back into a file system.
1940 * %GFP_ATOMIC don't sleep.
1941 *
0bbbc0b3 1942 * @order:Order of the GFP allocation.
1da177e4
LT
1943 * @vma: Pointer to VMA or NULL if not available.
1944 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1945 *
1946 * This function allocates a page from the kernel page pool and applies
1947 * a NUMA policy associated with the VMA or the current process.
 1948 * When @vma is not NULL, the caller must hold down_read on the mmap_sem
 1949 * of the VMA's mm_struct to prevent it from going away. Should be used
 1950 * for all allocations of pages that will be mapped into user space.
 1951 * Returns NULL when no page can be allocated.
 1952 *
 1953 * Should be called with the mmap_sem of the @vma held.
1954 */
1955struct page *
0bbbc0b3 1956alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2f5f9486 1957 unsigned long addr, int node)
1da177e4 1958{
cc9a6c87 1959 struct mempolicy *pol;
480eccf9 1960 struct zonelist *zl;
c0ff7453 1961 struct page *page;
cc9a6c87
MG
1962 unsigned int cpuset_mems_cookie;
1963
1964retry_cpuset:
1965 pol = get_vma_policy(current, vma, addr);
1966 cpuset_mems_cookie = get_mems_allowed();
1da177e4 1967
45c4745a 1968 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1da177e4 1969 unsigned nid;
5da7ca86 1970
8eac563c 1971 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
52cd3b07 1972 mpol_cond_put(pol);
0bbbc0b3 1973 page = alloc_page_interleave(gfp, order, nid);
cc9a6c87
MG
1974 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1975 goto retry_cpuset;
1976
c0ff7453 1977 return page;
1da177e4 1978 }
2f5f9486 1979 zl = policy_zonelist(gfp, pol, node);
52cd3b07 1980 if (unlikely(mpol_needs_cond_ref(pol))) {
480eccf9 1981 /*
52cd3b07 1982 * slow path: ref counted shared policy
480eccf9 1983 */
0bbbc0b3 1984 struct page *page = __alloc_pages_nodemask(gfp, order,
52cd3b07 1985 zl, policy_nodemask(gfp, pol));
f0be3d32 1986 __mpol_put(pol);
cc9a6c87
MG
1987 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1988 goto retry_cpuset;
480eccf9
LS
1989 return page;
1990 }
1991 /*
1992 * fast path: default or task policy
1993 */
0bbbc0b3
AA
1994 page = __alloc_pages_nodemask(gfp, order, zl,
1995 policy_nodemask(gfp, pol));
cc9a6c87
MG
1996 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1997 goto retry_cpuset;
c0ff7453 1998 return page;
1da177e4
LT
1999}
2000
2001/**
2002 * alloc_pages_current - Allocate pages.
2003 *
2004 * @gfp:
2005 * %GFP_USER user allocation,
2006 * %GFP_KERNEL kernel allocation,
2007 * %GFP_HIGHMEM highmem allocation,
2008 * %GFP_FS don't call back into a file system.
2009 * %GFP_ATOMIC don't sleep.
2010 * @order: Power of two of allocation size in pages. 0 is a single page.
2011 *
 2012 * Allocate a page from the kernel page pool. When not in interrupt
 2013 * context, apply the current process' NUMA policy.
2014 * Returns NULL when no page can be allocated.
2015 *
cf2a473c 2016 * Don't call cpuset_update_task_memory_state() unless
1da177e4
LT
2017 * 1) it's ok to take cpuset_sem (can WAIT), and
2018 * 2) allocating for current task (not interrupt).
2019 */
dd0fc66f 2020struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1da177e4
LT
2021{
2022 struct mempolicy *pol = current->mempolicy;
c0ff7453 2023 struct page *page;
cc9a6c87 2024 unsigned int cpuset_mems_cookie;
1da177e4 2025
9b819d20 2026 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1da177e4 2027 pol = &default_policy;
52cd3b07 2028
cc9a6c87
MG
2029retry_cpuset:
2030 cpuset_mems_cookie = get_mems_allowed();
2031
52cd3b07
LS
2032 /*
2033 * No reference counting needed for current->mempolicy
2034 * nor system default_policy
2035 */
45c4745a 2036 if (pol->mode == MPOL_INTERLEAVE)
c0ff7453
MX
2037 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2038 else
2039 page = __alloc_pages_nodemask(gfp, order,
5c4b4be3
AK
2040 policy_zonelist(gfp, pol, numa_node_id()),
2041 policy_nodemask(gfp, pol));
cc9a6c87
MG
2042
2043 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2044 goto retry_cpuset;
2045
c0ff7453 2046 return page;
1da177e4
LT
2047}
2048EXPORT_SYMBOL(alloc_pages_current);
2049
4225399a 2050/*
846a16bf 2051 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
4225399a
PJ
 2052 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2053 * with the mems_allowed returned by cpuset_mems_allowed(). This
2054 * keeps mempolicies cpuset relative after its cpuset moves. See
2055 * further kernel/cpuset.c update_nodemask().
708c1bbc
MX
2056 *
 2057 * current's mempolicy may be rebound by another task (the task that changes
 2058 * the cpuset's mems), so we needn't do the rebind work for the current task.
4225399a 2059 */
4225399a 2060
846a16bf
LS
2061/* Slow path of a mempolicy duplicate */
2062struct mempolicy *__mpol_dup(struct mempolicy *old)
1da177e4
LT
2063{
2064 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2065
2066 if (!new)
2067 return ERR_PTR(-ENOMEM);
708c1bbc
MX
2068
2069 /* task's mempolicy is protected by alloc_lock */
2070 if (old == current->mempolicy) {
2071 task_lock(current);
2072 *new = *old;
2073 task_unlock(current);
2074 } else
2075 *new = *old;
2076
99ee4ca7 2077 rcu_read_lock();
4225399a
PJ
2078 if (current_cpuset_is_being_rebound()) {
2079 nodemask_t mems = cpuset_mems_allowed(current);
708c1bbc
MX
2080 if (new->flags & MPOL_F_REBINDING)
2081 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2082 else
2083 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
4225399a 2084 }
99ee4ca7 2085 rcu_read_unlock();
1da177e4 2086 atomic_set(&new->refcnt, 1);
1da177e4
LT
2087 return new;
2088}
2089
52cd3b07
LS
2090/*
 2091 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
 2092 * eliminate the MPOL_F_* flags that require conditional ref and
2093 * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
2094 * after return. Use the returned value.
2095 *
2096 * Allows use of a mempolicy for, e.g., multiple allocations with a single
2097 * policy lookup, even if the policy needs/has extra ref on lookup.
2098 * shmem_readahead needs this.
2099 */
2100struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
2101 struct mempolicy *frompol)
2102{
2103 if (!mpol_needs_cond_ref(frompol))
2104 return frompol;
2105
2106 *tompol = *frompol;
2107 tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
2108 __mpol_put(frompol);
2109 return tompol;
2110}
2111
1da177e4 2112/* Slow path of a mempolicy comparison */
fcfb4dcc 2113bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1da177e4
LT
2114{
2115 if (!a || !b)
fcfb4dcc 2116 return false;
45c4745a 2117 if (a->mode != b->mode)
fcfb4dcc 2118 return false;
19800502 2119 if (a->flags != b->flags)
fcfb4dcc 2120 return false;
19800502
BL
2121 if (mpol_store_user_nodemask(a))
2122 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
fcfb4dcc 2123 return false;
19800502 2124
45c4745a 2125 switch (a->mode) {
19770b32
MG
2126 case MPOL_BIND:
2127 /* Fall through */
1da177e4 2128 case MPOL_INTERLEAVE:
fcfb4dcc 2129 return !!nodes_equal(a->v.nodes, b->v.nodes);
1da177e4 2130 case MPOL_PREFERRED:
75719661 2131 return a->v.preferred_node == b->v.preferred_node;
1da177e4
LT
2132 default:
2133 BUG();
fcfb4dcc 2134 return false;
1da177e4
LT
2135 }
2136}
2137
1da177e4
LT
2138/*
2139 * Shared memory backing store policy support.
2140 *
2141 * Remember policies even when nobody has shared memory mapped.
2142 * The policies are kept in Red-Black tree linked from the inode.
 2143 * They are protected by the sp->mutex, which should be held
2144 * for any accesses to the tree.
2145 */
2146
2147/* lookup first element intersecting start-end */
b22d127a 2148/* Caller holds sp->mutex */
1da177e4
LT
2149static struct sp_node *
2150sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2151{
2152 struct rb_node *n = sp->root.rb_node;
2153
2154 while (n) {
2155 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2156
2157 if (start >= p->end)
2158 n = n->rb_right;
2159 else if (end <= p->start)
2160 n = n->rb_left;
2161 else
2162 break;
2163 }
2164 if (!n)
2165 return NULL;
2166 for (;;) {
2167 struct sp_node *w = NULL;
2168 struct rb_node *prev = rb_prev(n);
2169 if (!prev)
2170 break;
2171 w = rb_entry(prev, struct sp_node, nd);
2172 if (w->end <= start)
2173 break;
2174 n = prev;
2175 }
2176 return rb_entry(n, struct sp_node, nd);
2177}
2178
2179/* Insert a new shared policy into the list. */
 2180 /* Caller holds sp->mutex */
2181static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2182{
2183 struct rb_node **p = &sp->root.rb_node;
2184 struct rb_node *parent = NULL;
2185 struct sp_node *nd;
2186
2187 while (*p) {
2188 parent = *p;
2189 nd = rb_entry(parent, struct sp_node, nd);
2190 if (new->start < nd->start)
2191 p = &(*p)->rb_left;
2192 else if (new->end > nd->end)
2193 p = &(*p)->rb_right;
2194 else
2195 BUG();
2196 }
2197 rb_link_node(&new->nd, parent, p);
2198 rb_insert_color(&new->nd, &sp->root);
140d5a49 2199 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
45c4745a 2200 new->policy ? new->policy->mode : 0);
1da177e4
LT
2201}
2202
2203/* Find shared policy intersecting idx */
2204struct mempolicy *
2205mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2206{
2207 struct mempolicy *pol = NULL;
2208 struct sp_node *sn;
2209
2210 if (!sp->root.rb_node)
2211 return NULL;
b22d127a 2212 mutex_lock(&sp->mutex);
1da177e4
LT
2213 sn = sp_lookup(sp, idx, idx+1);
2214 if (sn) {
2215 mpol_get(sn->policy);
2216 pol = sn->policy;
2217 }
b22d127a 2218 mutex_unlock(&sp->mutex);
1da177e4
LT
2219 return pol;
2220}
2221
63f74ca2
KM
2222static void sp_free(struct sp_node *n)
2223{
2224 mpol_put(n->policy);
2225 kmem_cache_free(sn_cache, n);
2226}
2227
771fb4d8
LS
2228/**
2229 * mpol_misplaced - check whether current page node is valid in policy
2230 *
2231 * @page - page to be checked
2232 * @vma - vm area where page mapped
2233 * @addr - virtual address where page mapped
2234 *
2235 * Lookup current policy node id for vma,addr and "compare to" page's
2236 * node id.
2237 *
2238 * Returns:
2239 * -1 - not misplaced, page is in the right node
2240 * node - node id where the page should be
2241 *
2242 * Policy determination "mimics" alloc_page_vma().
2243 * Called from fault path where we know the vma and faulting address.
2244 */
2245int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2246{
2247 struct mempolicy *pol;
2248 struct zone *zone;
2249 int curnid = page_to_nid(page);
2250 unsigned long pgoff;
2251 int polnid = -1;
2252 int ret = -1;
2253
2254 BUG_ON(!vma);
2255
2256 pol = get_vma_policy(current, vma, addr);
2257 if (!(pol->flags & MPOL_F_MOF))
2258 goto out;
2259
2260 switch (pol->mode) {
2261 case MPOL_INTERLEAVE:
2262 BUG_ON(addr >= vma->vm_end);
2263 BUG_ON(addr < vma->vm_start);
2264
2265 pgoff = vma->vm_pgoff;
2266 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2267 polnid = offset_il_node(pol, vma, pgoff);
2268 break;
2269
2270 case MPOL_PREFERRED:
2271 if (pol->flags & MPOL_F_LOCAL)
2272 polnid = numa_node_id();
2273 else
2274 polnid = pol->v.preferred_node;
2275 break;
2276
2277 case MPOL_BIND:
2278 /*
2279 * allows binding to multiple nodes.
2280 * use current page if in policy nodemask,
2281 * else select nearest allowed node, if any.
2282 * If no allowed nodes, use current [!misplaced].
2283 */
2284 if (node_isset(curnid, pol->v.nodes))
2285 goto out;
2286 (void)first_zones_zonelist(
2287 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2288 gfp_zone(GFP_HIGHUSER),
2289 &pol->v.nodes, &zone);
2290 polnid = zone->node;
2291 break;
2292
2293 default:
2294 BUG();
2295 }
2296 if (curnid != polnid)
2297 ret = polnid;
2298out:
2299 mpol_cond_put(pol);
2300
2301 return ret;
2302}
2303
1da177e4
LT
2304static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2305{
140d5a49 2306 pr_debug("deleting %lx-%lx\n", n->start, n->end);
1da177e4 2307 rb_erase(&n->nd, &sp->root);
63f74ca2 2308 sp_free(n);
1da177e4
LT
2309}
2310
dbcb0f19
AB
2311static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2312 struct mempolicy *pol)
1da177e4 2313{
869833f2
KM
2314 struct sp_node *n;
2315 struct mempolicy *newpol;
1da177e4 2316
869833f2 2317 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1da177e4
LT
2318 if (!n)
2319 return NULL;
869833f2
KM
2320
2321 newpol = mpol_dup(pol);
2322 if (IS_ERR(newpol)) {
2323 kmem_cache_free(sn_cache, n);
2324 return NULL;
2325 }
2326 newpol->flags |= MPOL_F_SHARED;
2327
1da177e4
LT
2328 n->start = start;
2329 n->end = end;
869833f2
KM
2330 n->policy = newpol;
2331
1da177e4
LT
2332 return n;
2333}
2334
2335/* Replace a policy range. */
2336static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2337 unsigned long end, struct sp_node *new)
2338{
b22d127a
MG
2339 struct sp_node *n;
2340 int ret = 0;
1da177e4 2341
b22d127a 2342 mutex_lock(&sp->mutex);
1da177e4
LT
2343 n = sp_lookup(sp, start, end);
2344 /* Take care of old policies in the same range. */
2345 while (n && n->start < end) {
2346 struct rb_node *next = rb_next(&n->nd);
2347 if (n->start >= start) {
2348 if (n->end <= end)
2349 sp_delete(sp, n);
2350 else
2351 n->start = end;
2352 } else {
2353 /* Old policy spanning whole new range. */
2354 if (n->end > end) {
b22d127a
MG
2355 struct sp_node *new2;
2356 new2 = sp_alloc(end, n->end, n->policy);
1da177e4 2357 if (!new2) {
b22d127a
MG
2358 ret = -ENOMEM;
2359 goto out;
1da177e4
LT
2360 }
2361 n->end = start;
2362 sp_insert(sp, new2);
1da177e4
LT
2363 break;
2364 } else
2365 n->end = start;
2366 }
2367 if (!next)
2368 break;
2369 n = rb_entry(next, struct sp_node, nd);
2370 }
2371 if (new)
2372 sp_insert(sp, new);
b22d127a
MG
2373out:
2374 mutex_unlock(&sp->mutex);
2375 return ret;
1da177e4
LT
2376}
2377
71fe804b
LS
2378/**
2379 * mpol_shared_policy_init - initialize shared policy for inode
2380 * @sp: pointer to inode shared policy
2381 * @mpol: struct mempolicy to install
2382 *
2383 * Install non-NULL @mpol in inode's shared policy rb-tree.
2384 * On entry, the current task has a reference on a non-NULL @mpol.
2385 * This must be released on exit.
4bfc4495 2386 * This is called during get_inode(), so we can use GFP_KERNEL.
71fe804b
LS
2387 */
2388void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2389{
58568d2a
MX
2390 int ret;
2391
71fe804b 2392 sp->root = RB_ROOT; /* empty tree == default mempolicy */
b22d127a 2393 mutex_init(&sp->mutex);
71fe804b
LS
2394
2395 if (mpol) {
2396 struct vm_area_struct pvma;
2397 struct mempolicy *new;
4bfc4495 2398 NODEMASK_SCRATCH(scratch);
71fe804b 2399
4bfc4495 2400 if (!scratch)
5c0c1654 2401 goto put_mpol;
71fe804b
LS
2402 /* contextualize the tmpfs mount point mempolicy */
2403 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
15d77835 2404 if (IS_ERR(new))
0cae3457 2405 goto free_scratch; /* no valid nodemask intersection */
58568d2a
MX
2406
2407 task_lock(current);
4bfc4495 2408 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
58568d2a 2409 task_unlock(current);
15d77835 2410 if (ret)
5c0c1654 2411 goto put_new;
71fe804b
LS
2412
2413 /* Create pseudo-vma that contains just the policy */
2414 memset(&pvma, 0, sizeof(struct vm_area_struct));
2415 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2416 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
15d77835 2417
5c0c1654 2418put_new:
71fe804b 2419 mpol_put(new); /* drop initial ref */
0cae3457 2420free_scratch:
4bfc4495 2421 NODEMASK_SCRATCH_FREE(scratch);
5c0c1654
LS
2422put_mpol:
2423 mpol_put(mpol); /* drop our incoming ref on sb mpol */
7339ff83
RH
2424 }
2425}
2426
1da177e4
LT
2427int mpol_set_shared_policy(struct shared_policy *info,
2428 struct vm_area_struct *vma, struct mempolicy *npol)
2429{
2430 int err;
2431 struct sp_node *new = NULL;
2432 unsigned long sz = vma_pages(vma);
2433
028fec41 2434 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1da177e4 2435 vma->vm_pgoff,
45c4745a 2436 sz, npol ? npol->mode : -1,
028fec41 2437 npol ? npol->flags : -1,
140d5a49 2438 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1da177e4
LT
2439
2440 if (npol) {
2441 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2442 if (!new)
2443 return -ENOMEM;
2444 }
2445 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2446 if (err && new)
63f74ca2 2447 sp_free(new);
1da177e4
LT
2448 return err;
2449}
2450
2451/* Free a backing policy store on inode delete. */
2452void mpol_free_shared_policy(struct shared_policy *p)
2453{
2454 struct sp_node *n;
2455 struct rb_node *next;
2456
2457 if (!p->root.rb_node)
2458 return;
b22d127a 2459 mutex_lock(&p->mutex);
1da177e4
LT
2460 next = rb_first(&p->root);
2461 while (next) {
2462 n = rb_entry(next, struct sp_node, nd);
2463 next = rb_next(&n->nd);
63f74ca2 2464 sp_delete(p, n);
1da177e4 2465 }
b22d127a 2466 mutex_unlock(&p->mutex);
1da177e4
LT
2467}
2468
2469/* assumes fs == KERNEL_DS */
2470void __init numa_policy_init(void)
2471{
b71636e2
PM
2472 nodemask_t interleave_nodes;
2473 unsigned long largest = 0;
2474 int nid, prefer = 0;
2475
1da177e4
LT
2476 policy_cache = kmem_cache_create("numa_policy",
2477 sizeof(struct mempolicy),
20c2df83 2478 0, SLAB_PANIC, NULL);
1da177e4
LT
2479
2480 sn_cache = kmem_cache_create("shared_policy_node",
2481 sizeof(struct sp_node),
20c2df83 2482 0, SLAB_PANIC, NULL);
1da177e4 2483
b71636e2
PM
2484 /*
2485 * Set interleaving policy for system init. Interleaving is only
2486 * enabled across suitably sized nodes (default is >= 16MB), or
2487 * fall back to the largest node if they're all smaller.
2488 */
2489 nodes_clear(interleave_nodes);
56bbd65d 2490 for_each_node_state(nid, N_HIGH_MEMORY) {
b71636e2
PM
2491 unsigned long total_pages = node_present_pages(nid);
2492
2493 /* Preserve the largest node */
2494 if (largest < total_pages) {
2495 largest = total_pages;
2496 prefer = nid;
2497 }
2498
2499 /* Interleave this node? */
2500 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2501 node_set(nid, interleave_nodes);
2502 }
2503
2504 /* All too small, use the largest */
2505 if (unlikely(nodes_empty(interleave_nodes)))
2506 node_set(prefer, interleave_nodes);
1da177e4 2507
028fec41 2508 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1da177e4
LT
2509 printk("numa_policy_init: interleaving failed\n");
2510}
2511
8bccd85f 2512/* Reset policy of current process to default */
1da177e4
LT
2513void numa_default_policy(void)
2514{
028fec41 2515 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1da177e4 2516}
68860ec1 2517
095f1fc4
LS
2518/*
2519 * Parse and format mempolicy from/to strings
2520 */
2521
1a75a6c8 2522/*
fc36b8d3 2523 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
3f226aa1 2524 * Used only for mpol_parse_str() and mpol_to_str()
1a75a6c8 2525 */
345ace9c
LS
2526static const char * const policy_modes[] =
2527{
2528 [MPOL_DEFAULT] = "default",
2529 [MPOL_PREFERRED] = "prefer",
2530 [MPOL_BIND] = "bind",
2531 [MPOL_INTERLEAVE] = "interleave",
d3a71033
LS
2532 [MPOL_LOCAL] = "local",
2533 [MPOL_NOOP] = "noop", /* should not actually be used */
345ace9c 2534};
1a75a6c8 2535
095f1fc4
LS
2536
2537#ifdef CONFIG_TMPFS
2538/**
2539 * mpol_parse_str - parse string to mempolicy
2540 * @str: string containing mempolicy to parse
71fe804b
LS
2541 * @mpol: pointer to struct mempolicy pointer, returned on success.
2542 * @no_context: flag whether to "contextualize" the mempolicy
095f1fc4
LS
2543 *
2544 * Format of input:
2545 * <mode>[=<flags>][:<nodelist>]
2546 *
71fe804b
LS
2547 * if @no_context is true, save the input nodemask in w.user_nodemask in
2548 * the returned mempolicy. This will be used to "clone" the mempolicy in
2549 * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
2550 * mount option. Note that if 'static' or 'relative' mode flags were
2551 * specified, the input nodemask will already have been saved. Saving
2552 * it again is redundant, but safe.
2553 *
2554 * On success, returns 0, else 1
095f1fc4 2555 */
71fe804b 2556int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
095f1fc4 2557{
71fe804b 2558 struct mempolicy *new = NULL;
b4652e84 2559 unsigned short mode;
71fe804b
LS
2560 unsigned short uninitialized_var(mode_flags);
2561 nodemask_t nodes;
095f1fc4
LS
2562 char *nodelist = strchr(str, ':');
2563 char *flags = strchr(str, '=');
095f1fc4
LS
2564 int err = 1;
2565
2566 if (nodelist) {
2567 /* NUL-terminate mode or flags string */
2568 *nodelist++ = '\0';
71fe804b 2569 if (nodelist_parse(nodelist, nodes))
095f1fc4 2570 goto out;
71fe804b 2571 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
095f1fc4 2572 goto out;
71fe804b
LS
2573 } else
2574 nodes_clear(nodes);
2575
095f1fc4
LS
2576 if (flags)
2577 *flags++ = '\0'; /* terminate mode string */
2578
479e2802 2579 for (mode = 0; mode < MPOL_MAX; mode++) {
345ace9c 2580 if (!strcmp(str, policy_modes[mode])) {
095f1fc4
LS
2581 break;
2582 }
2583 }
d3a71033 2584 if (mode >= MPOL_MAX || mode == MPOL_NOOP)
095f1fc4
LS
2585 goto out;
2586
71fe804b 2587 switch (mode) {
095f1fc4 2588 case MPOL_PREFERRED:
71fe804b
LS
2589 /*
2590 * Insist on a nodelist of one node only
2591 */
095f1fc4
LS
2592 if (nodelist) {
2593 char *rest = nodelist;
2594 while (isdigit(*rest))
2595 rest++;
926f2ae0
KM
2596 if (*rest)
2597 goto out;
095f1fc4
LS
2598 }
2599 break;
095f1fc4
LS
2600 case MPOL_INTERLEAVE:
2601 /*
2602 * Default to online nodes with memory if no nodelist
2603 */
2604 if (!nodelist)
71fe804b 2605 nodes = node_states[N_HIGH_MEMORY];
3f226aa1 2606 break;
71fe804b 2607 case MPOL_LOCAL:
3f226aa1 2608 /*
71fe804b 2609 * Don't allow a nodelist; mpol_new() checks flags
3f226aa1 2610 */
71fe804b 2611 if (nodelist)
3f226aa1 2612 goto out;
71fe804b 2613 mode = MPOL_PREFERRED;
3f226aa1 2614 break;
413b43de
RT
2615 case MPOL_DEFAULT:
2616 /*
 2617 * Insist on an empty nodelist
2618 */
2619 if (!nodelist)
2620 err = 0;
2621 goto out;
d69b2e63
KM
2622 case MPOL_BIND:
2623 /*
2624 * Insist on a nodelist
2625 */
2626 if (!nodelist)
2627 goto out;
095f1fc4
LS
2628 }
2629
71fe804b 2630 mode_flags = 0;
095f1fc4
LS
2631 if (flags) {
2632 /*
2633 * Currently, we only support two mutually exclusive
2634 * mode flags.
2635 */
2636 if (!strcmp(flags, "static"))
71fe804b 2637 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4 2638 else if (!strcmp(flags, "relative"))
71fe804b 2639 mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4 2640 else
926f2ae0 2641 goto out;
095f1fc4 2642 }
71fe804b
LS
2643
2644 new = mpol_new(mode, mode_flags, &nodes);
2645 if (IS_ERR(new))
926f2ae0
KM
2646 goto out;
2647
e17f74af
LS
2648 if (no_context) {
2649 /* save for contextualization */
2650 new->w.user_nodemask = nodes;
2651 } else {
58568d2a 2652 int ret;
4bfc4495
KH
2653 NODEMASK_SCRATCH(scratch);
2654 if (scratch) {
2655 task_lock(current);
2656 ret = mpol_set_nodemask(new, &nodes, scratch);
2657 task_unlock(current);
2658 } else
2659 ret = -ENOMEM;
2660 NODEMASK_SCRATCH_FREE(scratch);
2661 if (ret) {
4bfc4495 2662 mpol_put(new);
926f2ae0 2663 goto out;
58568d2a
MX
2664 }
2665 }
926f2ae0 2666 err = 0;
71fe804b 2667
095f1fc4
LS
2668out:
2669 /* Restore string for error message */
2670 if (nodelist)
2671 *--nodelist = ':';
2672 if (flags)
2673 *--flags = '=';
71fe804b
LS
2674 if (!err)
2675 *mpol = new;
095f1fc4
LS
2676 return err;
2677}
2678#endif /* CONFIG_TMPFS */
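A hedged sketch of the input format handled above, as it arrives from a tmpfs mount option such as mpol=interleave=static:0-3. Note that mpol_parse_str() modifies the string in place, so the buffer must be writable; the fragment assumes a shmem-style caller with no_context == 1.

	/* Hedged sketch of a caller. */
	struct mempolicy *mpol;
	char str[] = "interleave=static:0-3";

	if (!mpol_parse_str(str, &mpol, 1)) {
		/* success: mode MPOL_INTERLEAVE, MPOL_F_STATIC_NODES set,
		 * nodes 0-3 saved in w.user_nodemask for contextualization */
		mpol_put(mpol);
	}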
2679
71fe804b
LS
2680/**
2681 * mpol_to_str - format a mempolicy structure for printing
2682 * @buffer: to contain formatted mempolicy string
2683 * @maxlen: length of @buffer
2684 * @pol: pointer to mempolicy to be formatted
2685 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
2686 *
1a75a6c8
CL
2687 * Convert a mempolicy into a string.
2688 * Returns the number of characters in buffer (if positive)
2689 * or an error (negative)
2690 */
71fe804b 2691int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
1a75a6c8
CL
2692{
2693 char *p = buffer;
2694 int l;
2695 nodemask_t nodes;
bea904d5 2696 unsigned short mode;
f5b087b5 2697 unsigned short flags = pol ? pol->flags : 0;
1a75a6c8 2698
2291990a
LS
2699 /*
2700 * Sanity check: room for longest mode, flag and some nodes
2701 */
2702 VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2703
bea904d5
LS
2704 if (!pol || pol == &default_policy)
2705 mode = MPOL_DEFAULT;
2706 else
2707 mode = pol->mode;
2708
1a75a6c8
CL
2709 switch (mode) {
2710 case MPOL_DEFAULT:
2711 nodes_clear(nodes);
2712 break;
2713
2714 case MPOL_PREFERRED:
2715 nodes_clear(nodes);
fc36b8d3 2716 if (flags & MPOL_F_LOCAL)
53f2556b
LS
2717 mode = MPOL_LOCAL; /* pseudo-policy */
2718 else
fc36b8d3 2719 node_set(pol->v.preferred_node, nodes);
1a75a6c8
CL
2720 break;
2721
2722 case MPOL_BIND:
19770b32 2723 /* Fall through */
1a75a6c8 2724 case MPOL_INTERLEAVE:
71fe804b
LS
2725 if (no_context)
2726 nodes = pol->w.user_nodemask;
2727 else
2728 nodes = pol->v.nodes;
1a75a6c8
CL
2729 break;
2730
2731 default:
80de7c31 2732 return -EINVAL;
1a75a6c8
CL
2733 }
2734
345ace9c 2735 l = strlen(policy_modes[mode]);
53f2556b
LS
2736 if (buffer + maxlen < p + l + 1)
2737 return -ENOSPC;
1a75a6c8 2738
345ace9c 2739 strcpy(p, policy_modes[mode]);
1a75a6c8
CL
2740 p += l;
2741
fc36b8d3 2742 if (flags & MPOL_MODE_FLAGS) {
f5b087b5
DR
2743 if (buffer + maxlen < p + 2)
2744 return -ENOSPC;
2745 *p++ = '=';
2746
2291990a
LS
2747 /*
2748 * Currently, the only defined flags are mutually exclusive
2749 */
f5b087b5 2750 if (flags & MPOL_F_STATIC_NODES)
2291990a
LS
2751 p += snprintf(p, buffer + maxlen - p, "static");
2752 else if (flags & MPOL_F_RELATIVE_NODES)
2753 p += snprintf(p, buffer + maxlen - p, "relative");
f5b087b5
DR
2754 }
2755
1a75a6c8
CL
2756 if (!nodes_empty(nodes)) {
2757 if (buffer + maxlen < p + 2)
2758 return -ENOSPC;
095f1fc4 2759 *p++ = ':';
1a75a6c8
CL
2760 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2761 }
2762 return p - buffer;
2763}