/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

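/*
 * Illustrative userspace sketch (not part of this file; added here purely
 * as documentation, with made-up node numbers and mapping size): how a
 * process could select the policies described above through the
 * set_mempolicy() and mbind() system calls, declared in <numaif.h>
 * (link with -lnuma).
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// Interleave all future allocations of this process over nodes 0-1.
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// Bind one anonymous mapping to node 0 only, with no fallback.
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long node0 = 1UL << 0;
 *	mbind(p, 1 << 20, MPOL_BIND, &node0, sizeof(node0) * 8, 0);
 */
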
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/mempolicy.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
        int empty = nodes_empty(*nodes);

        switch (mode) {
        case MPOL_DEFAULT:
                if (!empty)
                        return -EINVAL;
                break;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                /* Preferred will only use the first bit, but allow
                   more for now. */
                if (empty)
                        return -EINVAL;
                break;
        }
        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}

/* Copy a node mask from user space. */
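/*
 * maxnode is the caller's count of valid bits in nmask. Bits beyond
 * MAX_NUMNODES must all be clear; the accepted mask is then clipped to
 * the current cpuset's mems_allowed and validated against the requested
 * mode by mpol_check_policy().
 */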
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
                     unsigned long maxnode, int mode)
{
        unsigned long k;
        unsigned long nlongs;
        unsigned long endmask;

        --maxnode;
        nodes_clear(*nodes);
        if (maxnode == 0 || !nmask)
                return 0;

        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = ~0UL;
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

        /* When the user specified more nodes than supported just check
           if the non supported part is all zero. */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                if (nlongs > PAGE_SIZE/sizeof(long))
                        return -EINVAL;
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        unsigned long t;
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        if (k == nlongs - 1) {
                                if (t & endmask)
                                        return -EINVAL;
                        } else if (t)
                                return -EINVAL;
                }
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
                endmask = ~0UL;
        }

        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        nodes_addr(*nodes)[nlongs-1] &= endmask;
        /* Update current mems_allowed */
        cpuset_update_current_mems_allowed();
        /* Ignore nodes not set in current->mems_allowed */
        /* AK: shouldn't this error out instead? */
        cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
        return mpol_check_policy(mode, nodes);
}

/* Generate a custom zonelist for the BIND policy. */
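/*
 * The zonelist holds, for every node in the mask, that node's zones in
 * descending order (highest zone first), skipping zones with no present
 * pages, and is terminated by a NULL entry. policy_zone is raised as a
 * side effect so that only allocations for the highest populated zone
 * are policied.
 */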
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
        struct zonelist *zl;
        int num, max, nd;

        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
        for_each_node_mask(nd, *nodes) {
                int k;
                for (k = MAX_NR_ZONES-1; k >= 0; k--) {
                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
                        if (!z->present_pages)
                                continue;
                        zl->zones[num++] = z;
                        if (k > policy_zone)
                                policy_zone = k;
                }
        }
        zl->zones[num] = NULL;
        return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
        struct mempolicy *policy;

        PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
        if (mode == MPOL_DEFAULT)
                return NULL;
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);
        atomic_set(&policy->refcnt, 1);
        switch (mode) {
        case MPOL_INTERLEAVE:
                policy->v.nodes = *nodes;
                break;
        case MPOL_PREFERRED:
                policy->v.preferred_node = first_node(*nodes);
                if (policy->v.preferred_node >= MAX_NUMNODES)
                        policy->v.preferred_node = -1;
                break;
        case MPOL_BIND:
                policy->v.zonelist = bind_zonelist(nodes);
                if (policy->v.zonelist == NULL) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-ENOMEM);
                }
                break;
        }
        policy->policy = mode;
        return policy;
}

/* Ensure all existing pages follow the policy. */
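/*
 * The check_*_range() helpers below walk a VMA's page tables and report,
 * through a nonzero return value, the first mapped page whose node is not
 * in the allowed mask; check_range() turns that into -EIO when
 * MPOL_MF_STRICT was requested.
 */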
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pte_t *orig_pte;
        pte_t *pte;
        spinlock_t *ptl;

        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                unsigned long pfn;
                unsigned int nid;

                if (!pte_present(*pte))
                        continue;
                pfn = pte_pfn(*pte);
                if (!pfn_valid(pfn)) {
                        print_bad_pte(vma, *pte, addr);
                        continue;
                }
                nid = pfn_to_nid(pfn);
                if (!node_isset(nid, *nodes))
                        break;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(orig_pte, ptl);
        return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                if (check_pte_range(vma, pmd, addr, next, nodes))
                        return -EIO;
        } while (pmd++, addr = next, addr != end);
        return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                if (check_pmd_range(vma, pud, addr, next, nodes))
                        return -EIO;
        } while (pud++, addr = next, addr != end);
        return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pgd_t *pgd;
        unsigned long next;

        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                if (check_pud_range(vma, pgd, addr, next, nodes))
                        return -EIO;
        } while (pgd++, addr = next, addr != end);
        return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
            nodemask_t *nodes, unsigned long flags)
{
        int err;
        struct vm_area_struct *first, *vma, *prev;

        first = find_vma(mm, start);
        if (!first)
                return ERR_PTR(-EFAULT);
        if (first->vm_flags & VM_RESERVED)
                return ERR_PTR(-EACCES);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!vma->vm_next && vma->vm_end < end)
                        return ERR_PTR(-EFAULT);
                if (prev && prev->vm_end < vma->vm_start)
                        return ERR_PTR(-EFAULT);
                if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
                        unsigned long endvma = vma->vm_end;
                        if (endvma > end)
                                endvma = end;
                        if (vma->vm_start > start)
                                start = vma->vm_start;
                        err = check_pgd_range(vma, start, endvma, nodes);
                        if (err) {
                                first = ERR_PTR(err);
                                break;
                        }
                }
                prev = vma;
        }
        return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
        int err = 0;
        struct mempolicy *old = vma->vm_policy;

        PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
                 vma->vm_ops, vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

        if (vma->vm_ops && vma->vm_ops->set_policy)
                err = vma->vm_ops->set_policy(vma, new);
        if (!err) {
                mpol_get(new);
                vma->vm_policy = new;
                mpol_free(old);
        }
        return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, struct mempolicy *new)
{
        struct vm_area_struct *next;
        int err;

        err = 0;
        for (; vma && vma->vm_start < end; vma = next) {
                next = vma->vm_next;
                if (vma->vm_start < start)
                        err = split_vma(vma->vm_mm, vma, start, 1);
                if (!err && vma->vm_end > end)
                        err = split_vma(vma->vm_mm, vma, end, 0);
                if (!err)
                        err = policy_vma(vma, new);
                if (err)
                        break;
        }
        return err;
}

/* Change policy for a memory range */
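/*
 * sys_mbind() validates its arguments, copies in the nodemask, builds the
 * mempolicy, then, with mmap_sem held for writing, handles the VMAs in
 * [start, end): step 1 optionally checks existing pages against the mask
 * (MPOL_MF_STRICT), step 2 splits VMAs at the range boundaries and
 * installs the new policy on each piece.
 */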
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                          unsigned long mode,
                          unsigned long __user *nmask, unsigned long maxnode,
                          unsigned flags)
{
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        struct mempolicy *new;
        unsigned long end;
        nodemask_t nodes;
        int err;

        if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
                return -EINVAL;
        if (start & ~PAGE_MASK)
                return -EINVAL;
        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;
        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
        end = start + len;
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        err = get_nodes(&nodes, nmask, maxnode, mode);
        if (err)
                return err;

        new = mpol_new(mode, &nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);

        PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
                 mode,nodes_addr(nodes)[0]);

        down_write(&mm->mmap_sem);
        vma = check_range(mm, start, end, &nodes, flags);
        err = PTR_ERR(vma);
        if (!IS_ERR(vma))
                err = mbind_range(vma, start, end, new);
        up_write(&mm->mmap_sem);
        mpol_free(new);
        return err;
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
                                  unsigned long maxnode)
{
        int err;
        struct mempolicy *new;
        nodemask_t nodes;

        if (mode < 0 || mode > MPOL_MAX)
                return -EINVAL;
        err = get_nodes(&nodes, nmask, maxnode, mode);
        if (err)
                return err;
        new = mpol_new(mode, &nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);
        mpol_free(current->mempolicy);
        current->mempolicy = new;
        if (new && new->policy == MPOL_INTERLEAVE)
                current->il_next = first_node(new->v.nodes);
        return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
        int i;

        nodes_clear(*nodes);
        switch (p->policy) {
        case MPOL_BIND:
                for (i = 0; p->v.zonelist->zones[i]; i++)
                        node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
                break;
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                *nodes = p->v.nodes;
                break;
        case MPOL_PREFERRED:
                /* or use current node instead of online map? */
                if (p->v.preferred_node < 0)
                        *nodes = node_online_map;
                else
                        node_set(p->v.preferred_node, *nodes);
                break;
        default:
                BUG();
        }
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *p;
        int err;

        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
        if (err >= 0) {
                err = page_to_nid(p);
                put_page(p);
        }
        return err;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              nodemask_t *nodes)
{
        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

        if (copy > nbytes) {
                if (copy > PAGE_SIZE)
                        return -EINVAL;
                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
                        return -EFAULT;
                copy = nbytes;
        }
        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

/* Retrieve NUMA policy */
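/*
 * With MPOL_F_ADDR the policy of the VMA covering addr is reported rather
 * than the process policy; with MPOL_F_NODE the value returned in *policy
 * is a node number (the node backing addr, or the next interleave node)
 * instead of a policy mode.
 */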
asmlinkage long sys_get_mempolicy(int __user *policy,
                                  unsigned long __user *nmask,
                                  unsigned long maxnode,
                                  unsigned long addr, unsigned long flags)
{
        int err, pval;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy;

        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
                return -EINVAL;
        if (nmask != NULL && maxnode < MAX_NUMNODES)
                return -EINVAL;
        if (flags & MPOL_F_ADDR) {
                down_read(&mm->mmap_sem);
                vma = find_vma_intersection(mm, addr, addr+1);
                if (!vma) {
                        up_read(&mm->mmap_sem);
                        return -EFAULT;
                }
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else
                        pol = vma->vm_policy;
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        pval = err;
                } else if (pol == current->mempolicy &&
                                pol->policy == MPOL_INTERLEAVE) {
                        pval = current->il_next;
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else
                pval = pol->policy;

        if (vma) {
                up_read(&current->mm->mmap_sem);
                vma = NULL;
        }

        if (policy && put_user(pval, policy))
                return -EFAULT;

        err = 0;
        if (nmask) {
                nodemask_t nodes;
                get_zonemask(pol, &nodes);
                err = copy_nodes_to_user(nmask, maxnode, &nodes);
        }

 out:
        if (vma)
                up_read(&current->mm->mmap_sem);
        return err;
}

#ifdef CONFIG_COMPAT

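/*
 * The compat entry points convert the 32-bit compat_ulong_t nodemask
 * layout into the native unsigned long layout in a scratch user buffer
 * obtained from compat_alloc_user_space(), then call the native syscalls
 * above with the converted mask.
 */
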
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
                                     compat_ulong_t __user *nmask,
                                     compat_ulong_t maxnode,
                                     compat_ulong_t addr, compat_ulong_t flags)
{
        long err;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask)
                nm = compat_alloc_user_space(alloc_size);

        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

        if (!err && nmask) {
                err = copy_from_user(bm, nm, alloc_size);
                /* ensure entire bitmap is zeroed */
                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
                err |= compat_put_bitmap(nmask, bm, nr_bits);
        }

        return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
                                     compat_ulong_t maxnode)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(bm, nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, bm, alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
                             compat_ulong_t mode, compat_ulong_t __user *nmask,
                             compat_ulong_t maxnode, compat_ulong_t flags)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        nodemask_t bm;

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = task->mempolicy;

        if (vma) {
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else if (vma->vm_policy &&
                                vma->vm_policy->policy != MPOL_DEFAULT)
                        pol = vma->vm_policy;
        }
        if (!pol)
                pol = &default_policy;
        return pol;
}

/* Return a zonelist representing a mempolicy */
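/*
 * For MPOL_BIND the custom zonelist is only used when the allocation is
 * for the policied (highest) zone and the zonelist is still valid against
 * the current cpuset; lower-zone or invalid requests fall through to the
 * local node's standard zonelists, matching the "only the highest zone is
 * policied" rule from the header comment.
 */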
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
        int nd;

        switch (policy->policy) {
        case MPOL_PREFERRED:
                nd = policy->v.preferred_node;
                if (nd < 0)
                        nd = numa_node_id();
                break;
        case MPOL_BIND:
                /* Lower zones don't get a policy applied */
                /* Careful: current->mems_allowed might have moved */
                if (gfp_zone(gfp) >= policy_zone)
                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
                                return policy->v.zonelist;
                /*FALL THROUGH*/
        case MPOL_INTERLEAVE: /* should not happen */
        case MPOL_DEFAULT:
                nd = numa_node_id();
                break;
        default:
                nd = 0;
                BUG();
        }
        return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
        unsigned nid, next;
        struct task_struct *me = current;

        nid = me->il_next;
        next = next_node(nid, policy->v.nodes);
        if (next >= MAX_NUMNODES)
                next = first_node(policy->v.nodes);
        me->il_next = next;
        return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
                struct vm_area_struct *vma, unsigned long off)
{
        unsigned nnodes = nodes_weight(pol->v.nodes);
        unsigned target = (unsigned)off % nnodes;
        int c;
        int nid = -1;

        c = 0;
        do {
                nid = next_node(nid, pol->v.nodes);
                c++;
        } while (c <= target);
        return nid;
}
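
/*
 * Unlike interleave_nodes() above, offset_il_node() keeps no state: the
 * target node is derived purely from the VMA/file offset, so repeated
 * faults at the same offset always map to the same node.
 */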

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                                          unsigned nid)
{
        struct zonelist *zl;
        struct page *page;

        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
        page = __alloc_pages(gfp, order, zl);
        if (page && page_zone(page) == zl->zones[0]) {
                zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
                put_cpu();
        }
        return page;
}

753
754/**
755 * alloc_page_vma - Allocate a page for a VMA.
756 *
757 * @gfp:
758 * %GFP_USER user allocation.
759 * %GFP_KERNEL kernel allocations,
760 * %GFP_HIGHMEM highmem/user allocations,
761 * %GFP_FS allocation should not call back into a file system.
762 * %GFP_ATOMIC don't sleep.
763 *
764 * @vma: Pointer to VMA or NULL if not available.
765 * @addr: Virtual Address of the allocation. Must be inside the VMA.
766 *
767 * This function allocates a page from the kernel page pool and applies
768 * a NUMA policy associated with the VMA or the current process.
769 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
770 * mm_struct of the VMA to prevent it from going away. Should be used for
771 * all allocations for pages that will be mapped into
772 * user space. Returns NULL when no page can be allocated.
773 *
774 * Should be called with the mm_sem of the vma hold.
775 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        cpuset_update_current_mems_allowed();

        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
                unsigned nid;
                if (vma) {
                        unsigned long off;
                        off = vma->vm_pgoff;
                        off += (addr - vma->vm_start) >> PAGE_SHIFT;
                        nid = offset_il_node(pol, vma, off);
                } else {
                        /* fall back to process interleaving */
                        nid = interleave_nodes(pol);
                }
                return alloc_page_interleave(gfp, 0, nid);
        }
        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *      %GFP_USER    user allocation,
 *      %GFP_KERNEL  kernel allocation,
 *      %GFP_HIGHMEM highmem allocation,
 *      %GFP_FS      don't call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool and, when not in interrupt
 * context, apply the current process' NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
        struct mempolicy *pol = current->mempolicy;

        if ((gfp & __GFP_WAIT) && !in_interrupt())
                cpuset_update_current_mems_allowed();
        if (!pol || in_interrupt())
                pol = &default_policy;
        if (pol->policy == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

        if (!new)
                return ERR_PTR(-ENOMEM);
        *new = *old;
        atomic_set(&new->refcnt, 1);
        if (new->policy == MPOL_BIND) {
                int sz = ksize(old->v.zonelist);
                new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
                if (!new->v.zonelist) {
                        kmem_cache_free(policy_cache, new);
                        return ERR_PTR(-ENOMEM);
                }
                memcpy(new->v.zonelist, old->v.zonelist, sz);
        }
        return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return 0;
        if (a->policy != b->policy)
                return 0;
        switch (a->policy) {
        case MPOL_DEFAULT:
                return 1;
        case MPOL_INTERLEAVE:
                return nodes_equal(a->v.nodes, b->v.nodes);
        case MPOL_PREFERRED:
                return a->v.preferred_node == b->v.preferred_node;
        case MPOL_BIND: {
                int i;
                for (i = 0; a->v.zonelist->zones[i]; i++)
                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
                                return 0;
                return b->v.zonelist->zones[i] == NULL;
        }
        default:
                BUG();
                return 0;
        }
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
        if (!atomic_dec_and_test(&p->refcnt))
                return;
        if (p->policy == MPOL_BIND)
                kfree(p->v.zonelist);
        p->policy = MPOL_DEFAULT;
        kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        switch (pol->policy) {
        case MPOL_DEFAULT:
                return numa_node_id();
        case MPOL_BIND:
                return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
        case MPOL_INTERLEAVE:
                return interleave_nodes(pol);
        case MPOL_PREFERRED:
                return pol->v.preferred_node >= 0 ?
                                pol->v.preferred_node : numa_node_id();
        }
        BUG();
        return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        switch (pol->policy) {
        case MPOL_PREFERRED:
        case MPOL_DEFAULT:
        case MPOL_INTERLEAVE:
                return 1;
        case MPOL_BIND: {
                struct zone **z;
                for (z = pol->v.zonelist->zones; *z; z++)
                        if ((*z)->zone_pgdat->node_id == nid)
                                return 1;
                return 0;
        }
        default:
                BUG();
                return 0;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

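/*
 * Typical usage (tmpfs/shmem, as an example): the filesystem keeps a
 * struct shared_policy in its inode info, its vm_ops->set_policy hook
 * feeds mbind() requests into mpol_set_shared_policy(), and page
 * allocation looks up the policy for a file index with
 * mpol_shared_policy_lookup().
 */
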
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *n = sp->root.rb_node;

        while (n) {
                struct sp_node *p = rb_entry(n, struct sp_node, nd);

                if (start >= p->end)
                        n = n->rb_right;
                else if (end <= p->start)
                        n = n->rb_left;
                else
                        break;
        }
        if (!n)
                return NULL;
        for (;;) {
                struct sp_node *w = NULL;
                struct rb_node *prev = rb_prev(n);
                if (!prev)
                        break;
                w = rb_entry(prev, struct sp_node, nd);
                if (w->end <= start)
                        break;
                n = prev;
        }
        return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **p = &sp->root.rb_node;
        struct rb_node *parent = NULL;
        struct sp_node *nd;

        while (*p) {
                parent = *p;
                nd = rb_entry(parent, struct sp_node, nd);
                if (new->start < nd->start)
                        p = &(*p)->rb_left;
                else if (new->end > nd->end)
                        p = &(*p)->rb_right;
                else
                        BUG();
        }
        rb_link_node(&new->nd, parent, p);
        rb_insert_color(&new->nd, &sp->root);
        PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        if (!sp->root.rb_node)
                return NULL;
        spin_lock(&sp->lock);
        sn = sp_lookup(sp, idx, idx+1);
        if (sn) {
                mpol_get(sn->policy);
                pol = sn->policy;
        }
        spin_unlock(&sp->lock);
        return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        PDprintk("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        mpol_free(n->policy);
        kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

        if (!n)
                return NULL;
        n->start = start;
        n->end = end;
        mpol_get(pol);
        n->policy = pol;
        return n;
}

/* Replace a policy range. */
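/*
 * Overlapping old ranges are trimmed or deleted. An old range that fully
 * spans [start, end) has to be split in two, which needs an extra sp_node;
 * since that allocation cannot be done under sp->lock, the lock is dropped
 * and the whole walk restarts once new2 has been allocated.
 */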
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol? npol->policy : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : -1);

        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
                kmem_cache_free(sn_cache, new);
        return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        if (!p->root.rb_node)
                return;
        spin_lock(&p->lock);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_free(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
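/*
 * The KERNEL_DS assumption matters because sys_set_mempolicy() copies the
 * nodemask with user-access primitives; passing it a kernel pointer such
 * as node_online_map only works while the address limit still covers
 * kernel addresses, as it does during early boot.
 */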
void __init numa_policy_init(void)
{
        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL, NULL);

        /* Set interleaving policy for system init. This way not all
           the data structures allocated at system boot end up in node zero. */

        if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
                              MAX_NUMNODES) < 0)
                printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
        sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}