mm/mempolicy.c (git.proxmox.com, mirror_ubuntu-zesty-kernel.git)
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave   Allocate memory interleaved over a set of nodes,
 *              with normal fallback if it fails.
 *              For VMA based allocations this interleaves based on the
 *              offset into the backing object or offset into the mapping
 *              for anonymous memory. For process policy a process counter
 *              is used.
 *
 * bind         Only allocate memory on a specific set of nodes,
 *              no fallback.
 *              FIXME: memory is allocated starting with the first node
 *              to the last. It would be better if bind would truly restrict
 *              the allocation to memory nodes instead.
 *
 * preferred    Try a specific node first before normal fallback.
 *              As a special case node -1 here means do the allocation
 *              on the local CPU. This is normally identical to default,
 *              but useful to set in a VMA when you have a non default
 *              process policy.
 *
 * default      Allocate on the local node first, or when on a VMA
 *              use the process policy. This is what Linux always did
 *              in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
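/*
 * A minimal user-space sketch of how these policies are selected
 * (illustrative only; assumes the set_mempolicy()/mbind() wrappers from
 * libnuma's <numaif.h>, which are not part of this file):
 *
 *     #include <numaif.h>
 *     #include <sys/mman.h>
 *
 *     // Interleave this process' future allocations over nodes 0 and 1.
 *     unsigned long mask = (1UL << 0) | (1UL << 1);
 *     set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *     // Bind one specific mapping to node 1 only, with strict checking.
 *     void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *     unsigned long node1 = 1UL << 1;
 *     mbind(buf, 1 << 20, MPOL_BIND, &node1, sizeof(node1) * 8,
 *           MPOL_MF_STRICT);
 */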
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)   /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;
struct mempolicy default_policy = {
    .refcnt = ATOMIC_INIT(1), /* never free it */
    .policy = MPOL_DEFAULT,
};
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
    int empty = nodes_empty(*nodes);

    switch (mode) {
    case MPOL_DEFAULT:
        if (!empty)
            return -EINVAL;
        break;
    case MPOL_BIND:
    case MPOL_INTERLEAVE:
        /* Preferred will only use the first bit, but allow
           more for now. */
        if (empty)
            return -EINVAL;
        break;
    }
    return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
    struct zonelist *zl;
    int num, max, nd;

    max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
    zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
    if (!zl)
        return NULL;
    num = 0;
    for_each_node_mask(nd, *nodes)
        zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
    zl->zones[num] = NULL;
    return zl;
}
/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
    struct mempolicy *policy;

    PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
    if (mode == MPOL_DEFAULT)
        return NULL;
    policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
    if (!policy)
        return ERR_PTR(-ENOMEM);
    atomic_set(&policy->refcnt, 1);
    switch (mode) {
    case MPOL_INTERLEAVE:
        policy->v.nodes = *nodes;
        if (nodes_weight(*nodes) == 0) {
            kmem_cache_free(policy_cache, policy);
            return ERR_PTR(-EINVAL);
        }
        break;
    case MPOL_PREFERRED:
        policy->v.preferred_node = first_node(*nodes);
        if (policy->v.preferred_node >= MAX_NUMNODES)
            policy->v.preferred_node = -1;
        break;
    case MPOL_BIND:
        policy->v.zonelist = bind_zonelist(nodes);
        if (policy->v.zonelist == NULL) {
            kmem_cache_free(policy_cache, policy);
            return ERR_PTR(-ENOMEM);
        }
        break;
    }
    policy->policy = mode;
    return policy;
}
/* Check if we are the only process mapping the page in question */
static inline int single_mm_mapping(struct mm_struct *mm,
                struct address_space *mapping)
{
    struct vm_area_struct *vma;
    struct prio_tree_iter iter;
    int rc = 1;

    spin_lock(&mapping->i_mmap_lock);
    vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
        if (mm != vma->vm_mm) {
            rc = 0;
            goto out;
        }
    list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
        if (mm != vma->vm_mm) {
            rc = 0;
            goto out;
        }
out:
    spin_unlock(&mapping->i_mmap_lock);
    return rc;
}
/*
 * Add a page to be migrated to the pagelist
 */
static void migrate_page_add(struct vm_area_struct *vma,
    struct page *page, struct list_head *pagelist, unsigned long flags)
{
    /*
     * Avoid migrating a page that is shared by others and not writable.
     */
    if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
        mapping_writably_mapped(page->mapping) ||
        single_mm_mapping(vma->vm_mm, page->mapping)) {
        int rc = isolate_lru_page(page);

        if (rc == 1)
            list_add(&page->lru, pagelist);
        /*
         * If the isolate attempt was not successful then we just
         * encountered an unswappable page. Something must be wrong.
         */
        WARN_ON(rc == 0);
    }
}
/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
        unsigned long addr, unsigned long end,
        const nodemask_t *nodes, unsigned long flags,
        void *private)
{
    pte_t *orig_pte;
    pte_t *pte;
    spinlock_t *ptl;

    orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
    do {
        struct page *page;
        unsigned int nid;

        if (!pte_present(*pte))
            continue;
        page = vm_normal_page(vma, addr, *pte);
        if (!page)
            continue;
        nid = page_to_nid(page);
        if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
            continue;

        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
            migrate_page_add(vma, page, private, flags);
        else
            break;
    } while (pte++, addr += PAGE_SIZE, addr != end);
    pte_unmap_unlock(orig_pte, ptl);
    return addr != end;
}
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
        unsigned long addr, unsigned long end,
        const nodemask_t *nodes, unsigned long flags,
        void *private)
{
    pmd_t *pmd;
    unsigned long next;

    pmd = pmd_offset(pud, addr);
    do {
        next = pmd_addr_end(addr, end);
        if (pmd_none_or_clear_bad(pmd))
            continue;
        if (check_pte_range(vma, pmd, addr, next, nodes,
                    flags, private))
            return -EIO;
    } while (pmd++, addr = next, addr != end);
    return 0;
}
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
        unsigned long addr, unsigned long end,
        const nodemask_t *nodes, unsigned long flags,
        void *private)
{
    pud_t *pud;
    unsigned long next;

    pud = pud_offset(pgd, addr);
    do {
        next = pud_addr_end(addr, end);
        if (pud_none_or_clear_bad(pud))
            continue;
        if (check_pmd_range(vma, pud, addr, next, nodes,
                    flags, private))
            return -EIO;
    } while (pud++, addr = next, addr != end);
    return 0;
}
static inline int check_pgd_range(struct vm_area_struct *vma,
        unsigned long addr, unsigned long end,
        const nodemask_t *nodes, unsigned long flags,
        void *private)
{
    pgd_t *pgd;
    unsigned long next;

    pgd = pgd_offset(vma->vm_mm, addr);
    do {
        next = pgd_addr_end(addr, end);
        if (pgd_none_or_clear_bad(pgd))
            continue;
        if (check_pud_range(vma, pgd, addr, next, nodes,
                    flags, private))
            return -EIO;
    } while (pgd++, addr = next, addr != end);
    return 0;
}
/* Check if a vma is migratable */
static inline int vma_migratable(struct vm_area_struct *vma)
{
    if (vma->vm_flags & (VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
        return 0;
    return 1;
}
/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
        const nodemask_t *nodes, unsigned long flags, void *private)
{
    int err;
    struct vm_area_struct *first, *vma, *prev;

    first = find_vma(mm, start);
    if (!first)
        return ERR_PTR(-EFAULT);
    prev = NULL;
    for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
            if (!vma->vm_next && vma->vm_end < end)
                return ERR_PTR(-EFAULT);
            if (prev && prev->vm_end < vma->vm_start)
                return ERR_PTR(-EFAULT);
        }
        if (!is_vm_hugetlb_page(vma) &&
            ((flags & MPOL_MF_STRICT) ||
             ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
              vma_migratable(vma)))) {
            unsigned long endvma = vma->vm_end;

            if (endvma > end)
                endvma = end;
            if (vma->vm_start > start)
                start = vma->vm_start;
            err = check_pgd_range(vma, start, endvma, nodes,
                        flags, private);
            if (err) {
                first = ERR_PTR(err);
                break;
            }
        }
        prev = vma;
    }
    return first;
}
/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
    int err = 0;
    struct mempolicy *old = vma->vm_policy;

    PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
         vma->vm_start, vma->vm_end, vma->vm_pgoff,
         vma->vm_ops, vma->vm_file,
         vma->vm_ops ? vma->vm_ops->set_policy : NULL);

    if (vma->vm_ops && vma->vm_ops->set_policy)
        err = vma->vm_ops->set_policy(vma, new);
    if (!err) {
        mpol_get(new);
        vma->vm_policy = new;
        mpol_free(old);
    }
    return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
               unsigned long end, struct mempolicy *new)
{
    struct vm_area_struct *next;
    int err;

    err = 0;
    for (; vma && vma->vm_start < end; vma = next) {
        next = vma->vm_next;
        if (vma->vm_start < start)
            err = split_vma(vma->vm_mm, vma, start, 1);
        if (!err && vma->vm_end > end)
            err = split_vma(vma->vm_mm, vma, end, 0);
        if (!err)
            err = policy_vma(vma, new);
        if (err)
            break;
    }
    return err;
}
static int contextualize_policy(int mode, nodemask_t *nodes)
{
    if (!nodes)
        return 0;

    /* Update current mems_allowed */
    cpuset_update_current_mems_allowed();
    /* Ignore nodes not set in current->mems_allowed */
    cpuset_restrict_to_mems_allowed(nodes->bits);
    return mpol_check_policy(mode, nodes);
}
static int swap_pages(struct list_head *pagelist)
{
    LIST_HEAD(moved);
    LIST_HEAD(failed);
    int n;

    n = migrate_pages(pagelist, NULL, &moved, &failed);
    putback_lru_pages(&failed);
    putback_lru_pages(&moved);

    return n;
}
long do_mbind(unsigned long start, unsigned long len,
        unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
    struct vm_area_struct *vma;
    struct mm_struct *mm = current->mm;
    struct mempolicy *new;
    unsigned long end;
    int err;
    LIST_HEAD(pagelist);

    if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
        || mode > MPOL_MAX)
        return -EINVAL;
    if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
        return -EPERM;

    if (start & ~PAGE_MASK)
        return -EINVAL;

    if (mode == MPOL_DEFAULT)
        flags &= ~MPOL_MF_STRICT;

    len = (len + PAGE_SIZE - 1) & PAGE_MASK;
    end = start + len;

    if (end < start)
        return -EINVAL;
    if (end == start)
        return 0;

    if (mpol_check_policy(mode, nmask))
        return -EINVAL;

    new = mpol_new(mode, nmask);
    if (IS_ERR(new))
        return PTR_ERR(new);

    /*
     * If we are using the default policy then operation
     * on discontinuous address spaces is okay after all
     */
    if (!new)
        flags |= MPOL_MF_DISCONTIG_OK;

    PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
         mode, nodes_addr(nodes)[0]);

    down_write(&mm->mmap_sem);
    vma = check_range(mm, start, end, nmask,
              flags | MPOL_MF_INVERT, &pagelist);

    err = PTR_ERR(vma);
    if (!IS_ERR(vma)) {
        int nr_failed = 0;

        err = mbind_range(vma, start, end, new);
        if (!list_empty(&pagelist))
            nr_failed = swap_pages(&pagelist);

        if (!err && nr_failed && (flags & MPOL_MF_STRICT))
            err = -EIO;
    }
    if (!list_empty(&pagelist))
        putback_lru_pages(&pagelist);

    up_write(&mm->mmap_sem);
    mpol_free(new);
    return err;
}
/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
    struct mempolicy *new;

    if (contextualize_policy(mode, nodes))
        return -EINVAL;
    new = mpol_new(mode, nodes);
    if (IS_ERR(new))
        return PTR_ERR(new);
    mpol_free(current->mempolicy);
    current->mempolicy = new;
    if (new && new->policy == MPOL_INTERLEAVE)
        current->il_next = first_node(new->v.nodes);
    return 0;
}
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
    int i;

    nodes_clear(*nodes);
    switch (p->policy) {
    case MPOL_BIND:
        for (i = 0; p->v.zonelist->zones[i]; i++)
            node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
                 *nodes);
        break;
    case MPOL_DEFAULT:
        break;
    case MPOL_INTERLEAVE:
        *nodes = p->v.nodes;
        break;
    case MPOL_PREFERRED:
        /* or use current node instead of online map? */
        if (p->v.preferred_node < 0)
            *nodes = node_online_map;
        else
            node_set(p->v.preferred_node, *nodes);
        break;
    default:
        BUG();
    }
}
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
    struct page *p;
    int err;

    err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
    if (err >= 0) {
        err = page_to_nid(p);
        put_page(p);
    }
    return err;
}
/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
            unsigned long addr, unsigned long flags)
{
    int err;
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma = NULL;
    struct mempolicy *pol = current->mempolicy;

    cpuset_update_current_mems_allowed();
    if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
        return -EINVAL;
    if (flags & MPOL_F_ADDR) {
        down_read(&mm->mmap_sem);
        vma = find_vma_intersection(mm, addr, addr+1);
        if (!vma) {
            up_read(&mm->mmap_sem);
            return -EFAULT;
        }
        if (vma->vm_ops && vma->vm_ops->get_policy)
            pol = vma->vm_ops->get_policy(vma, addr);
        else
            pol = vma->vm_policy;
    } else if (addr)
        return -EINVAL;

    if (!pol)
        pol = &default_policy;

    if (flags & MPOL_F_NODE) {
        if (flags & MPOL_F_ADDR) {
            err = lookup_node(mm, addr);
            if (err < 0)
                goto out;
            *policy = err;
        } else if (pol == current->mempolicy &&
                pol->policy == MPOL_INTERLEAVE) {
            *policy = current->il_next;
        } else {
            err = -EINVAL;
            goto out;
        }
    } else
        *policy = pol->policy;

    if (vma) {
        up_read(&current->mm->mmap_sem);
        vma = NULL;
    }

    err = 0;
    if (nmask)
        get_zonemask(pol, nmask);

out:
    if (vma)
        up_read(&current->mm->mmap_sem);
    return err;
}
/*
 * For now migrate_pages simply swaps out the pages from nodes that are in
 * the source set but not in the target set. In the future, we would
 * want a function that moves pages between the two nodesets in such
 * a way as to preserve the physical layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
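/*
 * User-space view of this operation (illustrative sketch only; assumes the
 * migrate_pages() wrapper from libnuma's <numaif.h>):
 *
 *     unsigned long from = 1UL << 0;   // pages currently on node 0 ...
 *     unsigned long to   = 1UL << 1;   // ... should end up on node 1
 *
 *     long left = migrate_pages(pid, 8 * sizeof(unsigned long), &from, &to);
 *     // 'left' is the number of pages that could not be moved.
 */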
int do_migrate_pages(struct mm_struct *mm,
    const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
    LIST_HEAD(pagelist);
    int count = 0;
    nodemask_t nodes;

    nodes_andnot(nodes, *from_nodes, *to_nodes);

    down_read(&mm->mmap_sem);
    check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
            flags | MPOL_MF_DISCONTIG_OK, &pagelist);

    if (!list_empty(&pagelist)) {
        count = swap_pages(&pagelist);
        putback_lru_pages(&pagelist);
    }

    up_read(&mm->mmap_sem);
    return count;
}
/*
 * User space interface with variable sized bitmaps for nodelists.
 */
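/*
 * What the variable sized bitmap looks like from user space (illustrative
 * sketch only): the caller passes an array of unsigned longs plus maxnode,
 * roughly the number of bits that are meaningful in it. Bits above the
 * kernel's MAX_NUMNODES must be zero.
 *
 *     #include <numaif.h>
 *
 *     // Nodes 0 and 5 set; the second long is just padding here.
 *     unsigned long nodemask[2] = { (1UL << 0) | (1UL << 5), 0 };
 *     unsigned long maxnode = 2 * sizeof(unsigned long) * 8;
 *
 *     set_mempolicy(MPOL_INTERLEAVE, nodemask, maxnode);
 */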
/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
             unsigned long maxnode)
{
    unsigned long k;
    unsigned long nlongs;
    unsigned long endmask;

    --maxnode;
    nodes_clear(*nodes);
    if (maxnode == 0 || !nmask)
        return 0;

    nlongs = BITS_TO_LONGS(maxnode);
    if ((maxnode % BITS_PER_LONG) == 0)
        endmask = ~0UL;
    else
        endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

    /* When the user specified more nodes than supported just check
       if the non supported part is all zero. */
    if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
        if (nlongs > PAGE_SIZE/sizeof(long))
            return -EINVAL;
        for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
            unsigned long t;
            if (get_user(t, nmask + k))
                return -EFAULT;
            if (k == nlongs - 1) {
                if (t & endmask)
                    return -EINVAL;
            } else if (t)
                return -EINVAL;
        }
        nlongs = BITS_TO_LONGS(MAX_NUMNODES);
        endmask = ~0UL;
    }

    if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
        return -EFAULT;
    nodes_addr(*nodes)[nlongs-1] &= endmask;
    return 0;
}
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                  nodemask_t *nodes)
{
    unsigned long copy = ALIGN(maxnode-1, 64) / 8;
    const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

    if (copy > nbytes) {
        if (copy > PAGE_SIZE)
            return -EINVAL;
        if (clear_user((char __user *)mask + nbytes, copy - nbytes))
            return -EFAULT;
        copy = nbytes;
    }
    return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
            unsigned long mode,
            unsigned long __user *nmask, unsigned long maxnode,
            unsigned flags)
{
    nodemask_t nodes;
    int err;

    err = get_nodes(&nodes, nmask, maxnode);
    if (err)
        return err;
    return do_mbind(start, len, mode, &nodes, flags);
}
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
        unsigned long maxnode)
{
    int err;
    nodemask_t nodes;

    if (mode < 0 || mode > MPOL_MAX)
        return -EINVAL;
    err = get_nodes(&nodes, nmask, maxnode);
    if (err)
        return err;
    return do_set_mempolicy(mode, &nodes);
}
/* Macro needed until Paul implements this function in kernel/cpusets.c */
#define cpuset_mems_allowed(task) node_online_map
asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
        const unsigned long __user *old_nodes,
        const unsigned long __user *new_nodes)
{
    struct mm_struct *mm;
    struct task_struct *task;
    nodemask_t old;
    nodemask_t new;
    nodemask_t task_nodes;
    int err;

    err = get_nodes(&old, old_nodes, maxnode);
    if (err)
        return err;

    err = get_nodes(&new, new_nodes, maxnode);
    if (err)
        return err;

    /* Find the mm_struct */
    read_lock(&tasklist_lock);
    task = pid ? find_task_by_pid(pid) : current;
    if (!task) {
        read_unlock(&tasklist_lock);
        return -ESRCH;
    }
    mm = get_task_mm(task);
    read_unlock(&tasklist_lock);

    if (!mm)
        return -EINVAL;

    /*
     * Check if this process has the right to modify the specified
     * process. The right exists if the process has administrative
     * capabilities, superuser privileges or the same
     * userid as the target process.
     */
    if ((current->euid != task->suid) && (current->euid != task->uid) &&
        (current->uid != task->suid) && (current->uid != task->uid) &&
        !capable(CAP_SYS_ADMIN)) {
        err = -EPERM;
        goto out;
    }

    task_nodes = cpuset_mems_allowed(task);
    /* Is the user allowed to access the target nodes? */
    if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
        err = -EPERM;
        goto out;
    }

    err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
out:
    mmput(mm);
    return err;
}
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
                unsigned long __user *nmask,
                unsigned long maxnode,
                unsigned long addr, unsigned long flags)
{
    int err, pval;
    nodemask_t nodes;

    if (nmask != NULL && maxnode < MAX_NUMNODES)
        return -EINVAL;

    err = do_get_mempolicy(&pval, &nodes, addr, flags);
    if (err)
        return err;

    if (policy && put_user(pval, policy))
        return -EFAULT;

    if (nmask)
        err = copy_nodes_to_user(nmask, maxnode, &nodes);

    return err;
}
#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
                     compat_ulong_t __user *nmask,
                     compat_ulong_t maxnode,
                     compat_ulong_t addr, compat_ulong_t flags)
{
    long err;
    unsigned long __user *nm = NULL;
    unsigned long nr_bits, alloc_size;
    DECLARE_BITMAP(bm, MAX_NUMNODES);

    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

    if (nmask)
        nm = compat_alloc_user_space(alloc_size);

    err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

    if (!err && nmask) {
        err = copy_from_user(bm, nm, alloc_size);
        /* ensure entire bitmap is zeroed */
        err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
        err |= compat_put_bitmap(nmask, bm, nr_bits);
    }

    return err;
}
asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
                     compat_ulong_t maxnode)
{
    long err = 0;
    unsigned long __user *nm = NULL;
    unsigned long nr_bits, alloc_size;
    DECLARE_BITMAP(bm, MAX_NUMNODES);

    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

    if (nmask) {
        err = compat_get_bitmap(bm, nmask, nr_bits);
        nm = compat_alloc_user_space(alloc_size);
        err |= copy_to_user(nm, bm, alloc_size);
    }

    if (err)
        return -EFAULT;

    return sys_set_mempolicy(mode, nm, nr_bits+1);
}
asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
                 compat_ulong_t mode, compat_ulong_t __user *nmask,
                 compat_ulong_t maxnode, compat_ulong_t flags)
{
    long err = 0;
    unsigned long __user *nm = NULL;
    unsigned long nr_bits, alloc_size;
    nodemask_t bm;

    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

    if (nmask) {
        err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
        nm = compat_alloc_user_space(alloc_size);
        err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
    }

    if (err)
        return -EFAULT;

    return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif
/* Return effective policy for a VMA */
struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
    struct mempolicy *pol = task->mempolicy;

    if (vma) {
        if (vma->vm_ops && vma->vm_ops->get_policy)
            pol = vma->vm_ops->get_policy(vma, addr);
        else if (vma->vm_policy &&
                vma->vm_policy->policy != MPOL_DEFAULT)
            pol = vma->vm_policy;
    }
    if (!pol)
        pol = &default_policy;
    return pol;
}
/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
    int nd;

    switch (policy->policy) {
    case MPOL_PREFERRED:
        nd = policy->v.preferred_node;
        if (nd < 0)
            nd = numa_node_id();
        break;
    case MPOL_BIND:
        /* Lower zones don't get a policy applied */
        /* Careful: current->mems_allowed might have moved */
        if (gfp_zone(gfp) >= policy_zone)
            if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
                return policy->v.zonelist;
        /*FALL THROUGH*/
    case MPOL_INTERLEAVE: /* should not happen */
    case MPOL_DEFAULT:
        nd = numa_node_id();
        break;
    default:
        nd = 0;
        BUG();
    }
    return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
    unsigned nid, next;
    struct task_struct *me = current;

    nid = me->il_next;
    next = next_node(nid, policy->v.nodes);
    if (next >= MAX_NUMNODES)
        next = first_node(policy->v.nodes);
    me->il_next = next;
    return nid;
}
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
        struct vm_area_struct *vma, unsigned long off)
{
    unsigned nnodes = nodes_weight(pol->v.nodes);
    unsigned target = (unsigned)off % nnodes;
    int c;
    int nid = -1;

    c = 0;
    do {
        nid = next_node(nid, pol->v.nodes);
        c++;
    } while (c <= target);
    return nid;
}
/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
         struct vm_area_struct *vma, unsigned long addr, int shift)
{
    if (vma) {
        unsigned long off;

        off = vma->vm_pgoff;
        off += (addr - vma->vm_start) >> shift;
        return offset_il_node(pol, vma, off);
    } else
        return interleave_nodes(pol);
}
/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
    struct mempolicy *pol = get_vma_policy(current, vma, addr);

    if (pol->policy == MPOL_INTERLEAVE) {
        unsigned nid;

        nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
        return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
    }
    return zonelist_policy(GFP_HIGHUSER, pol);
}
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                    unsigned nid)
{
    struct zonelist *zl;
    struct page *page;

    zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
    page = __alloc_pages(gfp, order, zl);
    if (page && page_zone(page) == zl->zones[0]) {
        zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
        put_cpu();
    }
    return page;
}
/**
 *     alloc_page_vma - Allocate a page for a VMA.
 *
 *     @gfp:
 *     %GFP_USER     user allocation.
 *     %GFP_KERNEL   kernel allocations,
 *     %GFP_HIGHMEM  highmem/user allocations,
 *     %GFP_FS       allocation should not call back into a file system.
 *     %GFP_ATOMIC   don't sleep.
 *
 *     @vma:  Pointer to VMA or NULL if not available.
 *     @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 *     This function allocates a page from the kernel page pool and applies
 *     a NUMA policy associated with the VMA or the current process.
 *     When VMA is not NULL caller must hold down_read on the mmap_sem of the
 *     mm_struct of the VMA to prevent it from going away. Should be used for
 *     all allocations for pages that will be mapped into
 *     user space. Returns NULL when no page can be allocated.
 *
 *     Should be called with the mm_sem of the vma held.
 */
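/*
 * A minimal sketch of the intended calling pattern (illustrative only;
 * do_anonymous_fault() here is a hypothetical caller, not a function in
 * this file):
 *
 *     static int do_anonymous_fault(struct vm_area_struct *vma,
 *                                   unsigned long addr)
 *     {
 *         struct page *page;
 *
 *         // mmap_sem is already held for read by the fault path.
 *         page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
 *         if (!page)
 *             return VM_FAULT_OOM;
 *         clear_user_highpage(page, addr);
 *         // ... map the page into the VMA ...
 *         return VM_FAULT_MINOR;
 *     }
 */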
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
    struct mempolicy *pol = get_vma_policy(current, vma, addr);

    cpuset_update_current_mems_allowed();

    if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
        unsigned nid;

        nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
        return alloc_page_interleave(gfp, 0, nid);
    }
    return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
/**
 *     alloc_pages_current - Allocate pages.
 *
 *     @gfp:
 *     %GFP_USER     user allocation,
 *     %GFP_KERNEL   kernel allocation,
 *     %GFP_HIGHMEM  highmem allocation,
 *     %GFP_FS       don't call back into a file system.
 *     %GFP_ATOMIC   don't sleep.
 *     @order: Power of two of allocation size in pages. 0 is a single page.
 *
 *     Allocate a page from the kernel page pool. When not in interrupt
 *     context, the current process' NUMA policy is applied.
 *     Returns NULL when no page can be allocated.
 *
 *     Don't call cpuset_update_current_mems_allowed() unless
 *     1) it's ok to take cpuset_sem (can WAIT), and
 *     2) allocating for current task (not interrupt).
 */
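/*
 * Sketch of how this entry point is typically reached (illustrative only;
 * the alloc_pages() wrapper lives in gfp.h, not in this file):
 *
 *     // Somewhere in driver or kernel code:
 *     struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 *     // With CONFIG_NUMA, alloc_pages() resolves to
 *     // alloc_pages_current(GFP_KERNEL, 0), so the allocation follows
 *     // current->mempolicy (e.g. interleaving over its nodemask).
 */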
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
    struct mempolicy *pol = current->mempolicy;

    if ((gfp & __GFP_WAIT) && !in_interrupt())
        cpuset_update_current_mems_allowed();
    if (!pol || in_interrupt())
        pol = &default_policy;
    if (pol->policy == MPOL_INTERLEAVE)
        return alloc_page_interleave(gfp, order, interleave_nodes(pol));
    return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
    struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

    if (!new)
        return ERR_PTR(-ENOMEM);
    *new = *old;
    atomic_set(&new->refcnt, 1);
    if (new->policy == MPOL_BIND) {
        int sz = ksize(old->v.zonelist);
        new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
        if (!new->v.zonelist) {
            kmem_cache_free(policy_cache, new);
            return ERR_PTR(-ENOMEM);
        }
        memcpy(new->v.zonelist, old->v.zonelist, sz);
    }
    return new;
}
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
    if (!a || !b)
        return 0;
    if (a->policy != b->policy)
        return 0;
    switch (a->policy) {
    case MPOL_DEFAULT:
        return 1;
    case MPOL_INTERLEAVE:
        return nodes_equal(a->v.nodes, b->v.nodes);
    case MPOL_PREFERRED:
        return a->v.preferred_node == b->v.preferred_node;
    case MPOL_BIND: {
        int i;
        for (i = 0; a->v.zonelist->zones[i]; i++)
            if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
                return 0;
        return b->v.zonelist->zones[i] == NULL;
    }
    default:
        BUG();
        return 0;
    }
}
/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
    if (!atomic_dec_and_test(&p->refcnt))
        return;
    if (p->policy == MPOL_BIND)
        kfree(p->v.zonelist);
    p->policy = MPOL_DEFAULT;
    kmem_cache_free(policy_cache, p);
}
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
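/*
 * Rough sketch of how a filesystem such as tmpfs uses this interface
 * (simplified and illustrative; the real code lives in mm/shmem.c and
 * may differ in detail):
 *
 *     static int shmem_set_policy(struct vm_area_struct *vma,
 *                                 struct mempolicy *new)
 *     {
 *         struct inode *inode = vma->vm_file->f_dentry->d_inode;
 *         return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, new);
 *     }
 *
 *     static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
 *                                               unsigned long addr)
 *     {
 *         struct inode *inode = vma->vm_file->f_dentry->d_inode;
 *         unsigned long idx;
 *
 *         idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 *         return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, idx);
 *     }
 *
 * Both hooks are wired into the file's vm_operations_struct, which is how
 * get_vma_policy() and policy_vma() above reach them.
 */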
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
    struct rb_node *n = sp->root.rb_node;

    while (n) {
        struct sp_node *p = rb_entry(n, struct sp_node, nd);

        if (start >= p->end)
            n = n->rb_right;
        else if (end <= p->start)
            n = n->rb_left;
        else
            break;
    }
    if (!n)
        return NULL;
    for (;;) {
        struct sp_node *w = NULL;
        struct rb_node *prev = rb_prev(n);
        if (!prev)
            break;
        w = rb_entry(prev, struct sp_node, nd);
        if (w->end <= start)
            break;
        n = prev;
    }
    return rb_entry(n, struct sp_node, nd);
}
/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
    struct rb_node **p = &sp->root.rb_node;
    struct rb_node *parent = NULL;
    struct sp_node *nd;

    while (*p) {
        parent = *p;
        nd = rb_entry(parent, struct sp_node, nd);
        if (new->start < nd->start)
            p = &(*p)->rb_left;
        else if (new->end > nd->end)
            p = &(*p)->rb_right;
        else
            BUG();
    }
    rb_link_node(&new->nd, parent, p);
    rb_insert_color(&new->nd, &sp->root);
    PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
         new->policy ? new->policy->policy : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
    struct mempolicy *pol = NULL;
    struct sp_node *sn;

    if (!sp->root.rb_node)
        return NULL;
    spin_lock(&sp->lock);
    sn = sp_lookup(sp, idx, idx+1);
    if (sn) {
        mpol_get(sn->policy);
        pol = sn->policy;
    }
    spin_unlock(&sp->lock);
    return pol;
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
    PDprintk("deleting %lx-%lx\n", n->start, n->end);
    rb_erase(&n->nd, &sp->root);
    mpol_free(n->policy);
    kmem_cache_free(sn_cache, n);
}
static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
    struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

    if (!n)
        return NULL;
    n->start = start;
    n->end = end;
    mpol_get(pol);
    n->policy = pol;
    return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                 unsigned long end, struct sp_node *new)
{
    struct sp_node *n, *new2 = NULL;

restart:
    spin_lock(&sp->lock);
    n = sp_lookup(sp, start, end);
    /* Take care of old policies in the same range. */
    while (n && n->start < end) {
        struct rb_node *next = rb_next(&n->nd);
        if (n->start >= start) {
            if (n->end <= end)
                sp_delete(sp, n);
            else
                n->start = end;
        } else {
            /* Old policy spanning whole new range. */
            if (n->end > end) {
                if (!new2) {
                    spin_unlock(&sp->lock);
                    new2 = sp_alloc(end, n->end, n->policy);
                    if (!new2)
                        return -ENOMEM;
                    goto restart;
                }
                n->end = start;
                sp_insert(sp, new2);
                new2 = NULL;
                break;
            } else
                n->end = start;
        }
        if (!next)
            break;
        n = rb_entry(next, struct sp_node, nd);
    }
    if (new)
        sp_insert(sp, new);
    spin_unlock(&sp->lock);
    if (new2) {
        mpol_free(new2->policy);
        kmem_cache_free(sn_cache, new2);
    }
    return 0;
}
int mpol_set_shared_policy(struct shared_policy *info,
            struct vm_area_struct *vma, struct mempolicy *npol)
{
    int err;
    struct sp_node *new = NULL;
    unsigned long sz = vma_pages(vma);

    PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
         vma->vm_pgoff,
         sz, npol ? npol->policy : -1,
         npol ? nodes_addr(npol->v.nodes)[0] : -1);

    if (npol) {
        new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
        if (!new)
            return -ENOMEM;
    }
    err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
    if (err && new)
        kmem_cache_free(sn_cache, new);
    return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
    struct sp_node *n;
    struct rb_node *next;

    if (!p->root.rb_node)
        return;
    spin_lock(&p->lock);
    next = rb_first(&p->root);
    while (next) {
        n = rb_entry(next, struct sp_node, nd);
        next = rb_next(&n->nd);
        rb_erase(&n->nd, &p->root);
        mpol_free(n->policy);
        kmem_cache_free(sn_cache, n);
    }
    spin_unlock(&p->lock);
}
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
    policy_cache = kmem_cache_create("numa_policy",
                     sizeof(struct mempolicy),
                     0, SLAB_PANIC, NULL, NULL);

    sn_cache = kmem_cache_create("shared_policy_node",
                     sizeof(struct sp_node),
                     0, SLAB_PANIC, NULL, NULL);

    /* Set interleaving policy for system init. This way not all
       the data structures allocated at system boot end up in node zero. */

    if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
        printk("numa_policy_init: interleaving failed\n");
}
/* Reset policy of current process to default */
void numa_default_policy(void)
{
    do_set_mempolicy(MPOL_DEFAULT, NULL);
}
/* Migrate a policy to a different set of nodes */
static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
              const nodemask_t *new)
{
    nodemask_t tmp;

    if (!pol)
        return;

    switch (pol->policy) {
    case MPOL_DEFAULT:
        break;
    case MPOL_INTERLEAVE:
        nodes_remap(tmp, pol->v.nodes, *old, *new);
        pol->v.nodes = tmp;
        current->il_next = node_remap(current->il_next, *old, *new);
        break;
    case MPOL_PREFERRED:
        pol->v.preferred_node = node_remap(pol->v.preferred_node,
                           *old, *new);
        break;
    case MPOL_BIND: {
        nodemask_t nodes;
        struct zone **z;
        struct zonelist *zonelist;

        nodes_clear(nodes);
        for (z = pol->v.zonelist->zones; *z; z++)
            node_set((*z)->zone_pgdat->node_id, nodes);
        nodes_remap(tmp, nodes, *old, *new);
        nodes = tmp;

        zonelist = bind_zonelist(&nodes);

        /* If no mem, then zonelist is NULL and we keep old zonelist.
         * If that old zonelist has no remaining mems_allowed nodes,
         * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
         */

        if (zonelist) {
            /* Good - got mem - substitute new zonelist */
            kfree(pol->v.zonelist);
            pol->v.zonelist = zonelist;
        }
        break;
    }
    default:
        BUG();
        break;
    }
}
/*
 * Someone moved this task to different nodes. Fixup mempolicies.
 *
 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
 */
void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
{
    rebind_policy(current->mempolicy, old, new);
}
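/*
 * Illustrative example of the remap semantics used above: if a task's
 * allowed nodes change from {2,3} to {6,7}, then
 *
 *     node_remap(3, old, new) == 7
 *
 * because node 3 is the second set bit of the old mask and 7 is the second
 * set bit of the new one. An MPOL_PREFERRED policy on node 3 therefore
 * becomes a preference for node 7, and an interleave mask of {2,3} becomes
 * {6,7} via nodes_remap().
 */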