mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always graceful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/gfp.h>
  77 #include <linux/slab.h>
  78 #include <linux/string.h>
  79 #include <linux/module.h>
  80 #include <linux/nsproxy.h>
  81 #include <linux/interrupt.h>
  82 #include <linux/init.h>
  83 #include <linux/compat.h>
  84 #include <linux/swap.h>
  85 #include <linux/seq_file.h>
  86 #include <linux/proc_fs.h>
  87 #include <linux/migrate.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91
  92 #include <asm/tlbflush.h>
  93 #include <asm/uaccess.h>
  94
  95 /* Internal flags */
  96 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  97 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  98 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  99
 100 static struct kmem_cache *policy_cache;
 101 static struct kmem_cache *sn_cache;
 102
 103 /* Highest zone. An specific allocation for a zone below that is not
 104    policied. */
 105 enum zone_type policy_zone = 0;
 106
 107 struct mempolicy default_policy = {
 108         .refcnt = ATOMIC_INIT(1), /* never free it */
 109         .policy = MPOL_DEFAULT,
 110 };
 111
 112 static const struct mempolicy_operations {
 113         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 114         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 115 } mpol_ops[MPOL_MAX];
 116
 117 /* Check that the nodemask contains at least one populated zone */
 118 static int is_valid_nodemask(const nodemask_t *nodemask)
 119 {
 120         int nd, k;
 121
 122         /* Check that there is something useful in this mask */
 123         k = policy_zone;
 124
 125         for_each_node_mask(nd, *nodemask) {
 126                 struct zone *z;
 127
 128                 for (k = 0; k <= policy_zone; k++) {
 129                         z = &NODE_DATA(nd)->node_zones[k];
 130                         if (z->present_pages > 0)
 131                                 return 1;
 132                 }
 133         }
 134
 135         return 0;
 136 }
 137
 138 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 139 {
 140         return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 141 }
 142
 143 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 144                                    const nodemask_t *rel)
 145 {
 146         nodemask_t tmp;
 147         nodes_fold(tmp, *orig, nodes_weight(*rel));
 148         nodes_onto(*ret, tmp, *rel);
 149 }
 150
 151 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 152 {
 153         if (nodes_empty(*nodes))
 154                 return -EINVAL;
 155         pol->v.nodes = *nodes;
 156         return 0;
 157 }
 158
 159 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 160 {
 161         if (!nodes)
 162                 pol->v.preferred_node = -1;     /* local allocation */
 163         else if (nodes_empty(*nodes))
 164                 return -EINVAL;                 /*  no allowed nodes */
 165         else
 166                 pol->v.preferred_node = first_node(*nodes);
 167         return 0;
 168 }
 169
 170 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 171 {
 172         if (!is_valid_nodemask(nodes))
 173                 return -EINVAL;
 174         pol->v.nodes = *nodes;
 175         return 0;
 176 }
 177
 178 /* Create a new policy */
 179 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 180                                   nodemask_t *nodes)
 181 {
 182         struct mempolicy *policy;
 183         nodemask_t cpuset_context_nmask;
 184         int ret;
 185
 186         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 187                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 188
 189         if (mode == MPOL_DEFAULT) {
 190                 if (nodes && !nodes_empty(*nodes))
 191                         return ERR_PTR(-EINVAL);
 192                 return NULL;
 193         }
 194         VM_BUG_ON(!nodes);
 195
 196         /*
 197          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 198          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 199          * All other modes require a valid pointer to a non-empty nodemask.
 200          */
 201         if (mode == MPOL_PREFERRED) {
 202                 if (nodes_empty(*nodes)) {
 203                         if (((flags & MPOL_F_STATIC_NODES) ||
 204                              (flags & MPOL_F_RELATIVE_NODES)))
 205                                 return ERR_PTR(-EINVAL);
 206                         nodes = NULL;   /* flag local alloc */
 207                 }
 208         } else if (nodes_empty(*nodes))
 209                 return ERR_PTR(-EINVAL);
 210         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 211         if (!policy)
 212                 return ERR_PTR(-ENOMEM);
 213         atomic_set(&policy->refcnt, 1);
 214         policy->policy = mode;
 215         policy->flags = flags;
 216
 217         if (nodes) {
 218                 /*
 219                  * cpuset related setup doesn't apply to local allocation
 220                  */
 221                 cpuset_update_task_memory_state();
 222                 if (flags & MPOL_F_RELATIVE_NODES)
 223                         mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 224                                                &cpuset_current_mems_allowed);
 225                 else
 226                         nodes_and(cpuset_context_nmask, *nodes,
 227                                   cpuset_current_mems_allowed);
 228                 if (mpol_store_user_nodemask(policy))
 229                         policy->w.user_nodemask = *nodes;
 230                 else
 231                         policy->w.cpuset_mems_allowed =
 232                                                 cpuset_mems_allowed(current);
 233         }
 234
 235         ret = mpol_ops[mode].create(policy,
 236                                 nodes ? &cpuset_context_nmask : NULL);
 237         if (ret < 0) {
 238                 kmem_cache_free(policy_cache, policy);
 239                 return ERR_PTR(ret);
 240         }
 241         return policy;
 242 }
 243
 244 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 245 {
 246 }
 247
 248 static void mpol_rebind_nodemask(struct mempolicy *pol,
 249                                  const nodemask_t *nodes)
 250 {
 251         nodemask_t tmp;
 252
 253         if (pol->flags & MPOL_F_STATIC_NODES)
 254                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 255         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 256                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 257         else {
 258                 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 259                             *nodes);
 260                 pol->w.cpuset_mems_allowed = *nodes;
 261         }
 262
 263         pol->v.nodes = tmp;
 264         if (!node_isset(current->il_next, tmp)) {
 265                 current->il_next = next_node(current->il_next, tmp);
 266                 if (current->il_next >= MAX_NUMNODES)
 267                         current->il_next = first_node(tmp);
 268                 if (current->il_next >= MAX_NUMNODES)
 269                         current->il_next = numa_node_id();
 270         }
 271 }
 272
 273 static void mpol_rebind_preferred(struct mempolicy *pol,
 274                                   const nodemask_t *nodes)
 275 {
 276         nodemask_t tmp;
 277
 278         if (pol->flags & MPOL_F_STATIC_NODES) {
 279                 int node = first_node(pol->w.user_nodemask);
 280
 281                 if (node_isset(node, *nodes))
 282                         pol->v.preferred_node = node;
 283                 else
 284                         pol->v.preferred_node = -1;
 285         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 286                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 287                 pol->v.preferred_node = first_node(tmp);
 288         } else if (pol->v.preferred_node != -1) {
 289                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 290                                                    pol->w.cpuset_mems_allowed,
 291                                                    *nodes);
 292                 pol->w.cpuset_mems_allowed = *nodes;
 293         }
 294 }
 295
 296 /* Migrate a policy to a different set of nodes */
 297 static void mpol_rebind_policy(struct mempolicy *pol,
 298                                const nodemask_t *newmask)
 299 {
 300         if (!pol)
 301                 return;
 302         if (!mpol_store_user_nodemask(pol) &&
 303             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 304                 return;
 305         mpol_ops[pol->policy].rebind(pol, newmask);
 306 }
 307
 308 /*
 309  * Wrapper for mpol_rebind_policy() that just requires task
 310  * pointer, and updates task mempolicy.
 311  */
 312
 313 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 314 {
 315         mpol_rebind_policy(tsk->mempolicy, new);
 316 }
 317
 318 /*
 319  * Rebind each vma in mm to new nodemask.
 320  *
 321  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 322  */
 323
 324 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 325 {
 326         struct vm_area_struct *vma;
 327
 328         down_write(&mm->mmap_sem);
 329         for (vma = mm->mmap; vma; vma = vma->vm_next)
 330                 mpol_rebind_policy(vma->vm_policy, new);
 331         up_write(&mm->mmap_sem);
 332 }
 333
 334 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 335         [MPOL_DEFAULT] = {
 336                 .rebind = mpol_rebind_default,
 337         },
 338         [MPOL_INTERLEAVE] = {
 339                 .create = mpol_new_interleave,
 340                 .rebind = mpol_rebind_nodemask,
 341         },
 342         [MPOL_PREFERRED] = {
 343                 .create = mpol_new_preferred,
 344                 .rebind = mpol_rebind_preferred,
 345         },
 346         [MPOL_BIND] = {
 347                 .create = mpol_new_bind,
 348                 .rebind = mpol_rebind_nodemask,
 349         },
 350 };
 351
 352 static void gather_stats(struct page *, void *, int pte_dirty);
 353 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 354                                 unsigned long flags);
 355
 356 /* Scan through pages checking if pages follow certain conditions. */
 357 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 358                 unsigned long addr, unsigned long end,
 359                 const nodemask_t *nodes, unsigned long flags,
 360                 void *private)
 361 {
 362         pte_t *orig_pte;
 363         pte_t *pte;
 364         spinlock_t *ptl;
 365
 366         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 367         do {
 368                 struct page *page;
 369                 int nid;
 370
 371                 if (!pte_present(*pte))
 372                         continue;
 373                 page = vm_normal_page(vma, addr, *pte);
 374                 if (!page)
 375                         continue;
 376                 /*
 377                  * The check for PageReserved here is important to avoid
 378                  * handling zero pages and other pages that may have been
 379                  * marked special by the system.
 380                  *
 381                  * If the PageReserved would not be checked here then f.e.
 382                  * the location of the zero page could have an influence
 383                  * on MPOL_MF_STRICT, zero pages would be counted for
 384                  * the per node stats, and there would be useless attempts
 385                  * to put zero pages on the migration list.
 386                  */
 387                 if (PageReserved(page))
 388                         continue;
 389                 nid = page_to_nid(page);
 390                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 391                         continue;
 392
 393                 if (flags & MPOL_MF_STATS)
 394                         gather_stats(page, private, pte_dirty(*pte));
 395                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 396                         migrate_page_add(page, private, flags);
 397                 else
 398                         break;
 399         } while (pte++, addr += PAGE_SIZE, addr != end);
 400         pte_unmap_unlock(orig_pte, ptl);
 401         return addr != end;
 402 }
 403
 404 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 405                 unsigned long addr, unsigned long end,
 406                 const nodemask_t *nodes, unsigned long flags,
 407                 void *private)
 408 {
 409         pmd_t *pmd;
 410         unsigned long next;
 411
 412         pmd = pmd_offset(pud, addr);
 413         do {
 414                 next = pmd_addr_end(addr, end);
 415                 if (pmd_none_or_clear_bad(pmd))
 416                         continue;
 417                 if (check_pte_range(vma, pmd, addr, next, nodes,
 418                                     flags, private))
 419                         return -EIO;
 420         } while (pmd++, addr = next, addr != end);
 421         return 0;
 422 }
 423
 424 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 425                 unsigned long addr, unsigned long end,
 426                 const nodemask_t *nodes, unsigned long flags,
 427                 void *private)
 428 {
 429         pud_t *pud;
 430         unsigned long next;
 431
 432         pud = pud_offset(pgd, addr);
 433         do {
 434                 next = pud_addr_end(addr, end);
 435                 if (pud_none_or_clear_bad(pud))
 436                         continue;
 437                 if (check_pmd_range(vma, pud, addr, next, nodes,
 438                                     flags, private))
 439                         return -EIO;
 440         } while (pud++, addr = next, addr != end);
 441         return 0;
 442 }
 443
 444 static inline int check_pgd_range(struct vm_area_struct *vma,
 445                 unsigned long addr, unsigned long end,
 446                 const nodemask_t *nodes, unsigned long flags,
 447                 void *private)
 448 {
 449         pgd_t *pgd;
 450         unsigned long next;
 451
 452         pgd = pgd_offset(vma->vm_mm, addr);
 453         do {
 454                 next = pgd_addr_end(addr, end);
 455                 if (pgd_none_or_clear_bad(pgd))
 456                         continue;
 457                 if (check_pud_range(vma, pgd, addr, next, nodes,
 458                                     flags, private))
 459                         return -EIO;
 460         } while (pgd++, addr = next, addr != end);
 461         return 0;
 462 }
 463
 464 /*
 465  * Check if all pages in a range are on a set of nodes.
 466  * If pagelist != NULL then isolate pages from the LRU and
 467  * put them on the pagelist.
 468  */
 469 static struct vm_area_struct *
 470 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 471                 const nodemask_t *nodes, unsigned long flags, void *private)
 472 {
 473         int err;
 474         struct vm_area_struct *first, *vma, *prev;
 475
 476         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 477
 478                 err = migrate_prep();
 479                 if (err)
 480                         return ERR_PTR(err);
 481         }
 482
 483         first = find_vma(mm, start);
 484         if (!first)
 485                 return ERR_PTR(-EFAULT);
 486         prev = NULL;
 487         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 488                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 489                         if (!vma->vm_next && vma->vm_end < end)
 490                                 return ERR_PTR(-EFAULT);
 491                         if (prev && prev->vm_end < vma->vm_start)
 492                                 return ERR_PTR(-EFAULT);
 493                 }
 494                 if (!is_vm_hugetlb_page(vma) &&
 495                     ((flags & MPOL_MF_STRICT) ||
 496                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 497                                 vma_migratable(vma)))) {
 498                         unsigned long endvma = vma->vm_end;
 499
 500                         if (endvma > end)
 501                                 endvma = end;
 502                         if (vma->vm_start > start)
 503                                 start = vma->vm_start;
 504                         err = check_pgd_range(vma, start, endvma, nodes,
 505                                                 flags, private);
 506                         if (err) {
 507                                 first = ERR_PTR(err);
 508                                 break;
 509                         }
 510                 }
 511                 prev = vma;
 512         }
 513         return first;
 514 }
 515
 516 /* Apply policy to a single VMA */
 517 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 518 {
 519         int err = 0;
 520         struct mempolicy *old = vma->vm_policy;
 521
 522         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 523                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 524                  vma->vm_ops, vma->vm_file,
 525                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 526
 527         if (vma->vm_ops && vma->vm_ops->set_policy)
 528                 err = vma->vm_ops->set_policy(vma, new);
 529         if (!err) {
 530                 mpol_get(new);
 531                 vma->vm_policy = new;
 532                 mpol_put(old);
 533         }
 534         return err;
 535 }
 536
 537 /* Step 2: apply policy to a range and do splits. */
 538 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 539                        unsigned long end, struct mempolicy *new)
 540 {
 541         struct vm_area_struct *next;
 542         int err;
 543
 544         err = 0;
 545         for (; vma && vma->vm_start < end; vma = next) {
 546                 next = vma->vm_next;
 547                 if (vma->vm_start < start)
 548                         err = split_vma(vma->vm_mm, vma, start, 1);
 549                 if (!err && vma->vm_end > end)
 550                         err = split_vma(vma->vm_mm, vma, end, 0);
 551                 if (!err)
 552                         err = policy_vma(vma, new);
 553                 if (err)
 554                         break;
 555         }
 556         return err;
 557 }
 558
 559 /*
 560  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 561  * mempolicy.  Allows more rapid checking of this (combined perhaps
 562  * with other PF_* flag bits) on memory allocation hot code paths.
 563  *
 564  * If called from outside this file, the task 'p' should -only- be
 565  * a newly forked child not yet visible on the task list, because
 566  * manipulating the task flags of a visible task is not safe.
 567  *
 568  * The above limitation is why this routine has the funny name
 569  * mpol_fix_fork_child_flag().
 570  *
 571  * It is also safe to call this with a task pointer of current,
 572  * which the static wrapper mpol_set_task_struct_flag() does,
 573  * for use within this file.
 574  */
 575
 576 void mpol_fix_fork_child_flag(struct task_struct *p)
 577 {
 578         if (p->mempolicy)
 579                 p->flags |= PF_MEMPOLICY;
 580         else
 581                 p->flags &= ~PF_MEMPOLICY;
 582 }
 583
 584 static void mpol_set_task_struct_flag(void)
 585 {
 586         mpol_fix_fork_child_flag(current);
 587 }
 588
 589 /* Set the process memory policy */
 590 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 591                              nodemask_t *nodes)
 592 {
 593         struct mempolicy *new;
 594
 595         new = mpol_new(mode, flags, nodes);
 596         if (IS_ERR(new))
 597                 return PTR_ERR(new);
 598         mpol_put(current->mempolicy);
 599         current->mempolicy = new;
 600         mpol_set_task_struct_flag();
 601         if (new && new->policy == MPOL_INTERLEAVE &&
 602             nodes_weight(new->v.nodes))
 603                 current->il_next = first_node(new->v.nodes);
 604         return 0;
 605 }
 606
 607 /* Fill a zone bitmap for a policy */
 608 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 609 {
 610         nodes_clear(*nodes);
 611         switch (p->policy) {
 612         case MPOL_DEFAULT:
 613                 break;
 614         case MPOL_BIND:
 615                 /* Fall through */
 616         case MPOL_INTERLEAVE:
 617                 *nodes = p->v.nodes;
 618                 break;
 619         case MPOL_PREFERRED:
 620                 /* or use current node instead of memory_map? */
 621                 if (p->v.preferred_node < 0)
 622                         *nodes = node_states[N_HIGH_MEMORY];
 623                 else
 624                         node_set(p->v.preferred_node, *nodes);
 625                 break;
 626         default:
 627                 BUG();
 628         }
 629 }
 630
 631 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 632 {
 633         struct page *p;
 634         int err;
 635
 636         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 637         if (err >= 0) {
 638                 err = page_to_nid(p);
 639                 put_page(p);
 640         }
 641         return err;
 642 }
 643
 644 /* Retrieve NUMA policy */
 645 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 646                              unsigned long addr, unsigned long flags)
 647 {
 648         int err;
 649         struct mm_struct *mm = current->mm;
 650         struct vm_area_struct *vma = NULL;
 651         struct mempolicy *pol = current->mempolicy;
 652
 653         cpuset_update_task_memory_state();
 654         if (flags &
 655                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 656                 return -EINVAL;
 657
 658         if (flags & MPOL_F_MEMS_ALLOWED) {
 659                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 660                         return -EINVAL;
 661                 *policy = 0;    /* just so it's initialized */
 662                 *nmask  = cpuset_current_mems_allowed;
 663                 return 0;
 664         }
 665
 666         if (flags & MPOL_F_ADDR) {
 667                 down_read(&mm->mmap_sem);
 668                 vma = find_vma_intersection(mm, addr, addr+1);
 669                 if (!vma) {
 670                         up_read(&mm->mmap_sem);
 671                         return -EFAULT;
 672                 }
 673                 if (vma->vm_ops && vma->vm_ops->get_policy)
 674                         pol = vma->vm_ops->get_policy(vma, addr);
 675                 else
 676                         pol = vma->vm_policy;
 677         } else if (addr)
 678                 return -EINVAL;
 679
 680         if (!pol)
 681                 pol = &default_policy;
 682
 683         if (flags & MPOL_F_NODE) {
 684                 if (flags & MPOL_F_ADDR) {
 685                         err = lookup_node(mm, addr);
 686                         if (err < 0)
 687                                 goto out;
 688                         *policy = err;
 689                 } else if (pol == current->mempolicy &&
 690                                 pol->policy == MPOL_INTERLEAVE) {
 691                         *policy = current->il_next;
 692                 } else {
 693                         err = -EINVAL;
 694                         goto out;
 695                 }
 696         } else
 697                 *policy = pol->policy | pol->flags;
 698
 699         if (vma) {
 700                 up_read(&current->mm->mmap_sem);
 701                 vma = NULL;
 702         }
 703
 704         err = 0;
 705         if (nmask)
 706                 get_zonemask(pol, nmask);
 707
 708  out:
 709         if (vma)
 710                 up_read(&current->mm->mmap_sem);
 711         return err;
 712 }
 713
 714 #ifdef CONFIG_MIGRATION
 715 /*
 716  * page migration
 717  */
 718 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 719                                 unsigned long flags)
 720 {
 721         /*
 722          * Avoid migrating a page that is shared with others.
 723          */
 724         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 725                 isolate_lru_page(page, pagelist);
 726 }
 727
 728 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 729 {
 730         return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 731 }
 732
 733 /*
 734  * Migrate pages from one node to a target node.
 735  * Returns error or the number of pages not migrated.
 736  */
 737 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 738                            int flags)
 739 {
 740         nodemask_t nmask;
 741         LIST_HEAD(pagelist);
 742         int err = 0;
 743
 744         nodes_clear(nmask);
 745         node_set(source, nmask);
 746
 747         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 748                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 749
 750         if (!list_empty(&pagelist))
 751                 err = migrate_pages(&pagelist, new_node_page, dest);
 752
 753         return err;
 754 }
 755
 756 /*
 757  * Move pages between the two nodesets so as to preserve the physical
 758  * layout as much as possible.
 759  *
 760  * Returns the number of page that could not be moved.
 761  */
 762 int do_migrate_pages(struct mm_struct *mm,
 763         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 764 {
 765         LIST_HEAD(pagelist);
 766         int busy = 0;
 767         int err = 0;
 768         nodemask_t tmp;
 769
 770         down_read(&mm->mmap_sem);
 771
 772         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 773         if (err)
 774                 goto out;
 775
 776 /*
 777  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 778  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 779  * bit in 'tmp', and return that <source, dest> pair for migration.
 780  * The pair of nodemasks 'to' and 'from' define the map.
 781  *
 782  * If no pair of bits is found that way, fallback to picking some
 783  * pair of 'source' and 'dest' bits that are not the same.  If the
 784  * 'source' and 'dest' bits are the same, this represents a node
 785  * that will be migrating to itself, so no pages need move.
 786  *
 787  * If no bits are left in 'tmp', or if all remaining bits left
 788  * in 'tmp' correspond to the same bit in 'to', return false
 789  * (nothing left to migrate).
 790  *
 791  * This lets us pick a pair of nodes to migrate between, such that
 792  * if possible the dest node is not already occupied by some other
 793  * source node, minimizing the risk of overloading the memory on a
 794  * node that would happen if we migrated incoming memory to a node
 795  * before migrating outgoing memory source that same node.
 796  *
 797  * A single scan of tmp is sufficient.  As we go, we remember the
 798  * most recent <s, d> pair that moved (s != d).  If we find a pair
 799  * that not only moved, but what's better, moved to an empty slot
 800  * (d is not set in tmp), then we break out then, with that pair.
 801  * Otherwise when we finish scannng from_tmp, we at least have the
 802  * most recent <s, d> pair that moved.  If we get all the way through
 803  * the scan of tmp without finding any node that moved, much less
 804  * moved to an empty node, then there is nothing left worth migrating.
 805  */
 806
 807         tmp = *from_nodes;
 808         while (!nodes_empty(tmp)) {
 809                 int s,d;
 810                 int source = -1;
 811                 int dest = 0;
 812
 813                 for_each_node_mask(s, tmp) {
 814                         d = node_remap(s, *from_nodes, *to_nodes);
 815                         if (s == d)
 816                                 continue;
 817
 818                         source = s;     /* Node moved. Memorize */
 819                         dest = d;
 820
 821                         /* dest not in remaining from nodes? */
 822                         if (!node_isset(dest, tmp))
 823                                 break;
 824                 }
 825                 if (source == -1)
 826                         break;
 827
 828                 node_clear(source, tmp);
 829                 err = migrate_to_node(mm, source, dest, flags);
 830                 if (err > 0)
 831                         busy += err;
 832                 if (err < 0)
 833                         break;
 834         }
 835 out:
 836         up_read(&mm->mmap_sem);
 837         if (err < 0)
 838                 return err;
 839         return busy;
 840
 841 }
 842
 843 /*
 844  * Allocate a new page for page migration based on vma policy.
 845  * Start assuming that page is mapped by vma pointed to by @private.
 846  * Search forward from there, if not.  N.B., this assumes that the
 847  * list of pages handed to migrate_pages()--which is how we get here--
 848  * is in virtual address order.
 849  */
 850 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 851 {
 852         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 853         unsigned long uninitialized_var(address);
 854
 855         while (vma) {
 856                 address = page_address_in_vma(page, vma);
 857                 if (address != -EFAULT)
 858                         break;
 859                 vma = vma->vm_next;
 860         }
 861
 862         /*
 863          * if !vma, alloc_page_vma() will use task or system default policy
 864          */
 865         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 866 }
 867 #else
 868
 869 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 870                                 unsigned long flags)
 871 {       /* intentionally empty: no-op stub built in the #else branch of the enclosing conditional */
 872 }
 873
 874 int do_migrate_pages(struct mm_struct *mm,
 875         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 876 {
 877         return -ENOSYS; /* #else-branch stub: all arguments are ignored, always reports -ENOSYS */
 878 }
 879
 880 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 881 {
 882         return NULL; /* #else-branch stub: never allocates a replacement page */
 883 }
 884 #endif
 885
 886 static long do_mbind(unsigned long start, unsigned long len,
 887                      unsigned short mode, unsigned short mode_flags,
 888                      nodemask_t *nmask, unsigned long flags)
 889 {
 890         struct vm_area_struct *vma;
 891         struct mm_struct *mm = current->mm;
 892         struct mempolicy *new;
 893         unsigned long end;
 894         int err;
 895         LIST_HEAD(pagelist);
 896
 897         if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 898                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 899                 return -EINVAL;
 900         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 901                 return -EPERM;
 902
 903         if (start & ~PAGE_MASK)
 904                 return -EINVAL;
 905
 906         if (mode == MPOL_DEFAULT)
 907                 flags &= ~MPOL_MF_STRICT;
 908
 909         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 910         end = start + len;
 911
 912         if (end < start)
 913                 return -EINVAL;
 914         if (end == start)
 915                 return 0;
 916
 917         new = mpol_new(mode, mode_flags, nmask);
 918         if (IS_ERR(new))
 919                 return PTR_ERR(new);
 920
 921         /*
 922          * If we are using the default policy then operation
 923          * on discontinuous address spaces is okay after all
 924          */
 925         if (!new)
 926                 flags |= MPOL_MF_DISCONTIG_OK;
 927
 928         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 929                  start, start + len, mode, mode_flags,
 930                  nmask ? nodes_addr(*nmask)[0] : -1);
 931
 932         down_write(&mm->mmap_sem);
 933         vma = check_range(mm, start, end, nmask,
 934                           flags | MPOL_MF_INVERT, &pagelist);
 935
 936         err = PTR_ERR(vma);
 937         if (!IS_ERR(vma)) {
 938                 int nr_failed = 0;
 939
 940                 err = mbind_range(vma, start, end, new);
 941
 942                 if (!list_empty(&pagelist))
 943                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 944                                                 (unsigned long)vma);
 945
 946                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 947                         err = -EIO;
 948         }
 949
 950         up_write(&mm->mmap_sem);
 951         mpol_put(new);
 952         return err;
 953 }
 954
 955 /*
 956  * User space interface with variable sized bitmaps for nodelists.
 957  */
 958
 959 /* Copy a node mask from user space. */
 960 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 961                      unsigned long maxnode)
 962 {
 963         unsigned long k;
 964         unsigned long nlongs;
 965         unsigned long endmask;
 966
 967         --maxnode;
 968         nodes_clear(*nodes);
 969         if (maxnode == 0 || !nmask)
 970                 return 0;
 971         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 972                 return -EINVAL;
 973
 974         nlongs = BITS_TO_LONGS(maxnode);
 975         if ((maxnode % BITS_PER_LONG) == 0)
 976                 endmask = ~0UL;
 977         else
 978                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 979
 980         /* When the user specified more nodes than supported just check
 981            if the non supported part is all zero. */
 982         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 983                 if (nlongs > PAGE_SIZE/sizeof(long))
 984                         return -EINVAL;
 985                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 986                         unsigned long t;
 987                         if (get_user(t, nmask + k))
 988                                 return -EFAULT;
 989                         if (k == nlongs - 1) {
 990                                 if (t & endmask)
 991                                         return -EINVAL;
 992                         } else if (t)
 993                                 return -EINVAL;
 994                 }
 995                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 996                 endmask = ~0UL;
 997         }
 998
 999         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1000                 return -EFAULT;
1001         nodes_addr(*nodes)[nlongs-1] &= endmask;
1002         return 0;
1003 }
1004
1005 /* Copy a kernel node mask to user space */
1006 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1007                               nodemask_t *nodes)
1008 {
1009         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1010         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1011
1012         if (copy > nbytes) {
1013                 if (copy > PAGE_SIZE)
1014                         return -EINVAL;
1015                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1016                         return -EFAULT;
1017                 copy = nbytes;
1018         }
1019         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1020 }
1021
1022 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1023                         unsigned long mode,
1024                         unsigned long __user *nmask, unsigned long maxnode,
1025                         unsigned flags)
1026 {
1027         nodemask_t nodes;
1028         int err;
1029         unsigned short mode_flags;
1030
1031         mode_flags = mode & MPOL_MODE_FLAGS;
1032         mode &= ~MPOL_MODE_FLAGS;
1033         if (mode >= MPOL_MAX)
1034                 return -EINVAL;
1035         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1036             (mode_flags & MPOL_F_RELATIVE_NODES))
1037                 return -EINVAL;
1038         err = get_nodes(&nodes, nmask, maxnode);
1039         if (err)
1040                 return err;
1041         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1042 }
1043
1044 /* Set the process memory policy */
1045 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1046                 unsigned long maxnode)
1047 {
1048         int err;
1049         nodemask_t nodes;
1050         unsigned short flags;
1051
1052         flags = mode & MPOL_MODE_FLAGS;
1053         mode &= ~MPOL_MODE_FLAGS;
1054         if ((unsigned int)mode >= MPOL_MAX)
1055                 return -EINVAL;
1056         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1057                 return -EINVAL;
1058         err = get_nodes(&nodes, nmask, maxnode);
1059         if (err)
1060                 return err;
1061         return do_set_mempolicy(mode, flags, &nodes);
1062 }
1063
1064 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1065                 const unsigned long __user *old_nodes,
1066                 const unsigned long __user *new_nodes)
1067 {
1068         struct mm_struct *mm;
1069         struct task_struct *task;
1070         nodemask_t old;
1071         nodemask_t new;
1072         nodemask_t task_nodes;
1073         int err;
1074
1075         err = get_nodes(&old, old_nodes, maxnode);
1076         if (err)
1077                 return err;
1078
1079         err = get_nodes(&new, new_nodes, maxnode);
1080         if (err)
1081                 return err;
1082
1083         /* Find the mm_struct */
1084         read_lock(&tasklist_lock);
1085         task = pid ? find_task_by_vpid(pid) : current;
1086         if (!task) {
1087                 read_unlock(&tasklist_lock);
1088                 return -ESRCH;
1089         }
1090         mm = get_task_mm(task);
1091         read_unlock(&tasklist_lock);
1092
1093         if (!mm)
1094                 return -EINVAL;
1095
1096         /*
1097          * Check if this process has the right to modify the specified
1098          * process. The right exists if the process has administrative
1099          * capabilities, superuser privileges or the same
1100          * userid as the target process.
1101          */
1102         if ((current->euid != task->suid) && (current->euid != task->uid) &&
1103             (current->uid != task->suid) && (current->uid != task->uid) &&
1104             !capable(CAP_SYS_NICE)) {
1105                 err = -EPERM;
1106                 goto out;
1107         }
1108
1109         task_nodes = cpuset_mems_allowed(task);
1110         /* Is the user allowed to access the target nodes? */
1111         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1112                 err = -EPERM;
1113                 goto out;
1114         }
1115
1116         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1117                 err = -EINVAL;
1118                 goto out;
1119         }
1120
1121         err = security_task_movememory(task);
1122         if (err)
1123                 goto out;
1124
1125         err = do_migrate_pages(mm, &old, &new,
1126                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1127 out:
1128         mmput(mm);
1129         return err;
1130 }
1131
1132
1133 /* Retrieve NUMA policy */
1134 asmlinkage long sys_get_mempolicy(int __user *policy,
1135                                 unsigned long __user *nmask,
1136                                 unsigned long maxnode,
1137                                 unsigned long addr, unsigned long flags)
1138 {
1139         int err;
1140         int uninitialized_var(pval);
1141         nodemask_t nodes;
1142
1143         if (nmask != NULL && maxnode < MAX_NUMNODES)
1144                 return -EINVAL;
1145
1146         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1147
1148         if (err)
1149                 return err;
1150
1151         if (policy && put_user(pval, policy))
1152                 return -EFAULT;
1153
1154         if (nmask)
1155                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1156
1157         return err;
1158 }
1159
1160 #ifdef CONFIG_COMPAT
1161
1162 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1163                                      compat_ulong_t __user *nmask,
1164                                      compat_ulong_t maxnode,
1165                                      compat_ulong_t addr, compat_ulong_t flags)
1166 {
1167         long err;
1168         unsigned long __user *nm = NULL;
1169         unsigned long nr_bits, alloc_size;
1170         DECLARE_BITMAP(bm, MAX_NUMNODES);
1171
1172         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1173         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1174
1175         if (nmask)
1176                 nm = compat_alloc_user_space(alloc_size);
1177
1178         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1179
1180         if (!err && nmask) {
1181                 err = copy_from_user(bm, nm, alloc_size);
1182                 /* ensure entire bitmap is zeroed */
1183                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1184                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1185         }
1186
1187         return err;
1188 }
1189
1190 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1191                                      compat_ulong_t maxnode)
1192 {
1193         long err = 0;
1194         unsigned long __user *nm = NULL;
1195         unsigned long nr_bits, alloc_size;
1196         DECLARE_BITMAP(bm, MAX_NUMNODES);
1197
1198         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1199         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1200
1201         if (nmask) {
1202                 err = compat_get_bitmap(bm, nmask, nr_bits);
1203                 nm = compat_alloc_user_space(alloc_size);
1204                 err |= copy_to_user(nm, bm, alloc_size);
1205         }
1206
1207         if (err)
1208                 return -EFAULT;
1209
1210         return sys_set_mempolicy(mode, nm, nr_bits+1);
1211 }
1212
1213 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1214                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1215                              compat_ulong_t maxnode, compat_ulong_t flags)
1216 {
1217         long err = 0;
1218         unsigned long __user *nm = NULL;
1219         unsigned long nr_bits, alloc_size;
1220         nodemask_t bm;
1221
1222         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1223         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1224
1225         if (nmask) {
1226                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1227                 nm = compat_alloc_user_space(alloc_size);
1228                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1229         }
1230
1231         if (err)
1232                 return -EFAULT;
1233
1234         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1235 }
1236
1237 #endif
1238
1239 /*
1240  * get_vma_policy(@task, @vma, @addr)
1241  * @task - task for fallback if vma policy == default
1242  * @vma   - virtual memory area whose policy is sought
1243  * @addr  - address in @vma for shared policy lookup
1244  *
1245  * Returns effective policy for a VMA at specified address.
1246  * Falls back to @task or system default policy, as necessary.
1247  * Returned policy has extra reference count if shared, vma,
1248  * or some other task's policy [show_numa_maps() can pass
1249  * @task != current].  It is the caller's responsibility to
1250  * free the reference in these cases.
1251  */
1252 static struct mempolicy * get_vma_policy(struct task_struct *task,
1253                 struct vm_area_struct *vma, unsigned long addr)
1254 {
1255         struct mempolicy *pol = task->mempolicy;
1256         int shared_pol = 0;
1257
1258         if (vma) {
1259                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1260                         pol = vma->vm_ops->get_policy(vma, addr);
1261                         shared_pol = 1; /* if pol non-NULL, add ref below */
1262                 } else if (vma->vm_policy &&
1263                                 vma->vm_policy->policy != MPOL_DEFAULT)
1264                         pol = vma->vm_policy;
1265         }
1266         if (!pol)
1267                 pol = &default_policy;
1268         else if (!shared_pol && pol != current->mempolicy)
1269                 mpol_get(pol);  /* vma or other task's policy */
1270         return pol;
1271 }
1272
1273 /* Return a nodemask representing a mempolicy */
1274 static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
1275 {
1276         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1277         if (unlikely(policy->policy == MPOL_BIND) &&
1278                         gfp_zone(gfp) >= policy_zone &&
1279                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1280                 return &policy->v.nodes;
1281
1282         return NULL;
1283 }
1284
1285 /* Return a zonelist representing a mempolicy */
1286 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1287 {
1288         int nd;
1289
1290         switch (policy->policy) {
1291         case MPOL_PREFERRED:
1292                 nd = policy->v.preferred_node;
1293                 if (nd < 0)
1294                         nd = numa_node_id();
1295                 break;
1296         case MPOL_BIND:
1297                 /*
1298                  * Normally, MPOL_BIND allocations node-local are node-local
1299                  * within the allowed nodemask. However, if __GFP_THISNODE is
1300                  * set and the current node is part of the mask, we use the
1301                  * the zonelist for the first node in the mask instead.
1302                  */
1303                 nd = numa_node_id();
1304                 if (unlikely(gfp & __GFP_THISNODE) &&
1305                                 unlikely(!node_isset(nd, policy->v.nodes)))
1306                         nd = first_node(policy->v.nodes);
1307                 break;
1308         case MPOL_INTERLEAVE: /* should not happen */
1309         case MPOL_DEFAULT:
1310                 nd = numa_node_id();
1311                 break;
1312         default:
1313                 nd = 0;
1314                 BUG();
1315         }
1316         return node_zonelist(nd, gfp);
1317 }
1318
1319 /* Do dynamic interleaving for a process */
1320 static unsigned interleave_nodes(struct mempolicy *policy)
1321 {
1322         unsigned nid, next;
1323         struct task_struct *me = current;
1324
1325         nid = me->il_next;
1326         next = next_node(nid, policy->v.nodes);
1327         if (next >= MAX_NUMNODES)
1328                 next = first_node(policy->v.nodes);
1329         if (next < MAX_NUMNODES)
1330                 me->il_next = next;
1331         return nid;
1332 }
1333
1334 /*
1335  * Depending on the memory policy provide a node from which to allocate the
1336  * next slab entry.
1337  */
1338 unsigned slab_node(struct mempolicy *policy)
1339 {
1340         unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;
1341
1342         switch (pol) {
1343         case MPOL_INTERLEAVE:
1344                 return interleave_nodes(policy);
1345
1346         case MPOL_BIND: {
1347                 /*
1348                  * Follow bind policy behavior and start allocation at the
1349                  * first node.
1350                  */
1351                 struct zonelist *zonelist;
1352                 struct zone *zone;
1353                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1354                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1355                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1356                                                         &policy->v.nodes,
1357                                                         &zone);
1358                 return zone->node;
1359         }
1360
1361         case MPOL_PREFERRED:
1362                 if (policy->v.preferred_node >= 0)
1363                         return policy->v.preferred_node;
1364                 /* Fall through */
1365
1366         default:
1367                 return numa_node_id();
1368         }
1369 }
1370
1371 /* Do static interleaving for a VMA with known offset. */
1372 static unsigned offset_il_node(struct mempolicy *pol,
1373                 struct vm_area_struct *vma, unsigned long off)
1374 {
1375         unsigned nnodes = nodes_weight(pol->v.nodes);
1376         unsigned target;
1377         int c;
1378         int nid = -1;
1379
1380         if (!nnodes)
1381                 return numa_node_id();
1382         target = (unsigned int)off % nnodes;
1383         c = 0;
1384         do {
1385                 nid = next_node(nid, pol->v.nodes);
1386                 c++;
1387         } while (c <= target);
1388         return nid;
1389 }
1390
1391 /* Determine a node number for interleave */
1392 static inline unsigned interleave_nid(struct mempolicy *pol,
1393                  struct vm_area_struct *vma, unsigned long addr, int shift)
1394 {
1395         if (vma) {
1396                 unsigned long off;
1397
1398                 /*
1399                  * for small pages, there is no difference between
1400                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1401                  * for huge pages, since vm_pgoff is in units of small
1402                  * pages, we need to shift off the always 0 bits to get
1403                  * a useful offset.
1404                  */
1405                 BUG_ON(shift < PAGE_SHIFT);
1406                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1407                 off += (addr - vma->vm_start) >> shift;
1408                 return offset_il_node(pol, vma, off);
1409         } else
1410                 return interleave_nodes(pol);
1411 }
1412
1413 #ifdef CONFIG_HUGETLBFS
1414 /*
1415  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1416  * @vma = virtual memory area whose policy is sought
1417  * @addr = address in @vma for shared policy lookup and interleave policy
1418  * @gfp_flags = for requested zone
1419  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1420  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1421  *
1422  * Returns a zonelist suitable for a huge page allocation.
1423  * If the effective policy is 'BIND, returns pointer to local node's zonelist,
1424  * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
1425  * If it is also a policy for which get_vma_policy() returns an extra
1426  * reference, we must hold that reference until after the allocation.
1427  * In that case, return policy via @mpol so hugetlb allocation can drop
1428  * the reference. For non-'BIND referenced policies, we can/do drop the
1429  * reference here, so the caller doesn't need to know about the special case
1430  * for default and current task policy.
1431  */
1432 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1433                                 gfp_t gfp_flags, struct mempolicy **mpol,
1434                                 nodemask_t **nodemask)
1435 {
1436         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1437         struct zonelist *zl;
1438
1439         *mpol = NULL;           /* probably no unref needed */
1440         *nodemask = NULL;       /* assume !MPOL_BIND */
1441         if (pol->policy == MPOL_BIND) {
1442                         *nodemask = &pol->v.nodes;
1443         } else if (pol->policy == MPOL_INTERLEAVE) {
1444                 unsigned nid;
1445
1446                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1447                 if (unlikely(pol != &default_policy &&
1448                                 pol != current->mempolicy))
1449                         __mpol_put(pol);        /* finished with pol */
1450                 return node_zonelist(nid, gfp_flags);
1451         }
1452
1453         zl = zonelist_policy(GFP_HIGHUSER, pol);
1454         if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1455                 if (pol->policy != MPOL_BIND)
1456                         __mpol_put(pol);        /* finished with pol */
1457                 else
1458                         *mpol = pol;    /* unref needed after allocation */
1459         }
1460         return zl;
1461 }
1462 #endif
1463
1464 /* Allocate a page in interleaved policy.
1465    Own path because it needs to do special accounting. */
1466 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1467                                         unsigned nid)
1468 {
1469         struct zonelist *zl;
1470         struct page *page;
1471
1472         zl = node_zonelist(nid, gfp);
1473         page = __alloc_pages(gfp, order, zl);
1474         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1475                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1476         return page;
1477 }
1478
1479 /**
1480  *      alloc_page_vma  - Allocate a page for a VMA.
1481  *
1482  *      @gfp:
1483  *      %GFP_USER    user allocation.
1484  *      %GFP_KERNEL  kernel allocations,
1485  *      %GFP_HIGHMEM highmem/user allocations,
1486  *      %GFP_FS      allocation should not call back into a file system.
1487  *      %GFP_ATOMIC  don't sleep.
1488  *
1489  *      @vma:  Pointer to VMA or NULL if not available.
1490  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1491  *
1492  *      This function allocates a page from the kernel page pool and applies
1493  *      a NUMA policy associated with the VMA or the current process.
1494  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1495  *      mm_struct of the VMA to prevent it from going away. Should be used for
1496  *      all allocations for pages that will be mapped into
1497  *      user space. Returns NULL when no page can be allocated.
1498  *
1499  *      Should be called with the mm_sem of the vma hold.
1500  */
1501 struct page *
1502 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1503 {
1504         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1505         struct zonelist *zl;
1506
1507         cpuset_update_task_memory_state();
1508
1509         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1510                 unsigned nid;
1511
1512                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1513                 if (unlikely(pol != &default_policy &&
1514                                 pol != current->mempolicy))
1515                         __mpol_put(pol);        /* finished with pol */
1516                 return alloc_page_interleave(gfp, 0, nid);
1517         }
1518         zl = zonelist_policy(gfp, pol);
1519         if (pol != &default_policy && pol != current->mempolicy) {
1520                 /*
1521                  * slow path: ref counted policy -- shared or vma
1522                  */
1523                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1524                                                 zl, nodemask_policy(gfp, pol));
1525                 __mpol_put(pol);
1526                 return page;
1527         }
1528         /*
1529          * fast path:  default or task policy
1530          */
1531         return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
1532 }
1533
1534 /**
1535  *      alloc_pages_current - Allocate pages.
1536  *
1537  *      @gfp:
1538  *              %GFP_USER   user allocation,
1539  *              %GFP_KERNEL kernel allocation,
1540  *              %GFP_HIGHMEM highmem allocation,
1541  *              %GFP_FS     don't call back into a file system.
1542  *              %GFP_ATOMIC don't sleep.
1543  *      @order: Power of two of allocation size in pages. 0 is a single page.
1544  *
1545  *      Allocate a page from the kernel page pool.  When not in
1546  *      interrupt context and apply the current process NUMA policy.
1547  *      Returns NULL when no page can be allocated.
1548  *
1549  *      Don't call cpuset_update_task_memory_state() unless
1550  *      1) it's ok to take cpuset_sem (can WAIT), and
1551  *      2) allocating for current task (not interrupt).
1552  */
1553 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1554 {
1555         struct mempolicy *pol = current->mempolicy;
1556
1557         if ((gfp & __GFP_WAIT) && !in_interrupt())
1558                 cpuset_update_task_memory_state();
1559         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1560                 pol = &default_policy;
1561         if (pol->policy == MPOL_INTERLEAVE)
1562                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1563         return __alloc_pages_nodemask(gfp, order,
1564                         zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
1565 }
1566 EXPORT_SYMBOL(alloc_pages_current);
1567
1568 /*
1569  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1570  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1571  * with the mems_allowed returned by cpuset_mems_allowed().  This
1572  * keeps mempolicies cpuset relative after its cpuset moves.  See
1573  * further kernel/cpuset.c update_nodemask().
1574  */
1575
1576 /* Slow path of a mempolicy duplicate */
1577 struct mempolicy *__mpol_dup(struct mempolicy *old)
1578 {
1579         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1580
1581         if (!new)
1582                 return ERR_PTR(-ENOMEM);
1583         if (current_cpuset_is_being_rebound()) {
1584                 nodemask_t mems = cpuset_mems_allowed(current);
1585                 mpol_rebind_policy(old, &mems);
1586         }
1587         *new = *old;
1588         atomic_set(&new->refcnt, 1);
1589         return new;
1590 }
1591
1592 static int mpol_match_intent(const struct mempolicy *a,
1593                              const struct mempolicy *b)
1594 {
1595         if (a->flags != b->flags)
1596                 return 0;
1597         if (!mpol_store_user_nodemask(a))
1598                 return 1;
1599         return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1600 }
1601
1602 /* Slow path of a mempolicy comparison */
1603 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1604 {
1605         if (!a || !b)
1606                 return 0;
1607         if (a->policy != b->policy)
1608                 return 0;
1609         if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
1610                 return 0;
1611         switch (a->policy) {
1612         case MPOL_DEFAULT:
1613                 return 1;
1614         case MPOL_BIND:
1615                 /* Fall through */
1616         case MPOL_INTERLEAVE:
1617                 return nodes_equal(a->v.nodes, b->v.nodes);
1618         case MPOL_PREFERRED:
1619                 return a->v.preferred_node == b->v.preferred_node;
1620         default:
1621                 BUG();
1622                 return 0;
1623         }
1624 }
1625
1626 /* Slow path of a mpol destructor. */
1627 void __mpol_put(struct mempolicy *p)
1628 {
1629         if (!atomic_dec_and_test(&p->refcnt))
1630                 return;
1631         p->policy = MPOL_DEFAULT;
1632         kmem_cache_free(policy_cache, p);
1633 }
1634
1635 /*
1636  * Shared memory backing store policy support.
1637  *
1638  * Remember policies even when nobody has shared memory mapped.
1639  * The policies are kept in Red-Black tree linked from the inode.
1640  * They are protected by the sp->lock spinlock, which should be held
1641  * for any accesses to the tree.
1642  */
1643
1644 /* lookup first element intersecting start-end */
1645 /* Caller holds sp->lock */
1646 static struct sp_node *
1647 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1648 {
1649         struct rb_node *n = sp->root.rb_node;
1650
1651         while (n) {
1652                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1653
1654                 if (start >= p->end)
1655                         n = n->rb_right;
1656                 else if (end <= p->start)
1657                         n = n->rb_left;
1658                 else
1659                         break;
1660         }
1661         if (!n)
1662                 return NULL;
1663         for (;;) {
1664                 struct sp_node *w = NULL;
1665                 struct rb_node *prev = rb_prev(n);
1666                 if (!prev)
1667                         break;
1668                 w = rb_entry(prev, struct sp_node, nd);
1669                 if (w->end <= start)
1670                         break;
1671                 n = prev;
1672         }
1673         return rb_entry(n, struct sp_node, nd);
1674 }
1675
1676 /* Insert a new shared policy into the list. */
1677 /* Caller holds sp->lock */
1678 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1679 {
1680         struct rb_node **p = &sp->root.rb_node;
1681         struct rb_node *parent = NULL;
1682         struct sp_node *nd;
1683
1684         while (*p) {
1685                 parent = *p;
1686                 nd = rb_entry(parent, struct sp_node, nd);
1687                 if (new->start < nd->start)
1688                         p = &(*p)->rb_left;
1689                 else if (new->end > nd->end)
1690                         p = &(*p)->rb_right;
1691                 else
1692                         BUG();
1693         }
1694         rb_link_node(&new->nd, parent, p);
1695         rb_insert_color(&new->nd, &sp->root);
1696         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1697                  new->policy ? new->policy->policy : 0);
1698 }
1699
1700 /* Find shared policy intersecting idx */
1701 struct mempolicy *
1702 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1703 {
1704         struct mempolicy *pol = NULL;
1705         struct sp_node *sn;
1706
1707         if (!sp->root.rb_node)
1708                 return NULL;
1709         spin_lock(&sp->lock);
1710         sn = sp_lookup(sp, idx, idx+1);
1711         if (sn) {
1712                 mpol_get(sn->policy);
1713                 pol = sn->policy;
1714         }
1715         spin_unlock(&sp->lock);
1716         return pol;
1717 }
1718
1719 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1720 {
1721         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1722         rb_erase(&n->nd, &sp->root);
1723         mpol_put(n->policy);
1724         kmem_cache_free(sn_cache, n);
1725 }
1726
1727 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1728                                 struct mempolicy *pol)
1729 {
1730         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1731
1732         if (!n)
1733                 return NULL;
1734         n->start = start;
1735         n->end = end;
1736         mpol_get(pol);
1737         n->policy = pol;
1738         return n;
1739 }
1740
1741 /* Replace a policy range. */
1742 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1743                                  unsigned long end, struct sp_node *new)
1744 {
1745         struct sp_node *n, *new2 = NULL;
1746
1747 restart:
1748         spin_lock(&sp->lock);
1749         n = sp_lookup(sp, start, end);
1750         /* Take care of old policies in the same range. */
1751         while (n && n->start < end) {
1752                 struct rb_node *next = rb_next(&n->nd);
1753                 if (n->start >= start) {
1754                         if (n->end <= end)
1755                                 sp_delete(sp, n);
1756                         else
1757                                 n->start = end;
1758                 } else {
1759                         /* Old policy spanning whole new range. */
1760                         if (n->end > end) {
1761                                 if (!new2) {
1762                                         spin_unlock(&sp->lock);
1763                                         new2 = sp_alloc(end, n->end, n->policy);
1764                                         if (!new2)
1765                                                 return -ENOMEM;
1766                                         goto restart;
1767                                 }
1768                                 n->end = start;
1769                                 sp_insert(sp, new2);
1770                                 new2 = NULL;
1771                                 break;
1772                         } else
1773                                 n->end = start;
1774                 }
1775                 if (!next)
1776                         break;
1777                 n = rb_entry(next, struct sp_node, nd);
1778         }
1779         if (new)
1780                 sp_insert(sp, new);
1781         spin_unlock(&sp->lock);
1782         if (new2) {
1783                 mpol_put(new2->policy);
1784                 kmem_cache_free(sn_cache, new2);
1785         }
1786         return 0;
1787 }
1788
1789 void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1790                         unsigned short flags, nodemask_t *policy_nodes)
1791 {
1792         info->root = RB_ROOT;
1793         spin_lock_init(&info->lock);
1794
1795         if (policy != MPOL_DEFAULT) {
1796                 struct mempolicy *newpol;
1797
1798                 /* Falls back to MPOL_DEFAULT on any error */
1799                 newpol = mpol_new(policy, flags, policy_nodes);
1800                 if (!IS_ERR(newpol)) {
1801                         /* Create pseudo-vma that contains just the policy */
1802                         struct vm_area_struct pvma;
1803
1804                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1805                         /* Policy covers entire file */
1806                         pvma.vm_end = TASK_SIZE;
1807                         mpol_set_shared_policy(info, &pvma, newpol);
1808                         mpol_put(newpol);
1809                 }
1810         }
1811 }
1812
1813 int mpol_set_shared_policy(struct shared_policy *info,
1814                         struct vm_area_struct *vma, struct mempolicy *npol)
1815 {
1816         int err;
1817         struct sp_node *new = NULL;
1818         unsigned long sz = vma_pages(vma);
1819
1820         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1821                  vma->vm_pgoff,
1822                  sz, npol ? npol->policy : -1,
1823                  npol ? npol->flags : -1,
1824                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
1825
1826         if (npol) {
1827                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1828                 if (!new)
1829                         return -ENOMEM;
1830         }
1831         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1832         if (err && new)
1833                 kmem_cache_free(sn_cache, new);
1834         return err;
1835 }
1836
1837 /* Free a backing policy store on inode delete. */
1838 void mpol_free_shared_policy(struct shared_policy *p)
1839 {
1840         struct sp_node *n;
1841         struct rb_node *next;
1842
1843         if (!p->root.rb_node)
1844                 return;
1845         spin_lock(&p->lock);
1846         next = rb_first(&p->root);
1847         while (next) {
1848                 n = rb_entry(next, struct sp_node, nd);
1849                 next = rb_next(&n->nd);
1850                 rb_erase(&n->nd, &p->root);
1851                 mpol_put(n->policy);
1852                 kmem_cache_free(sn_cache, n);
1853         }
1854         spin_unlock(&p->lock);
1855 }
1856
1857 /* assumes fs == KERNEL_DS */
1858 void __init numa_policy_init(void)
1859 {
1860         nodemask_t interleave_nodes;
1861         unsigned long largest = 0;
1862         int nid, prefer = 0;
1863
1864         policy_cache = kmem_cache_create("numa_policy",
1865                                          sizeof(struct mempolicy),
1866                                          0, SLAB_PANIC, NULL);
1867
1868         sn_cache = kmem_cache_create("shared_policy_node",
1869                                      sizeof(struct sp_node),
1870                                      0, SLAB_PANIC, NULL);
1871
1872         /*
1873          * Set interleaving policy for system init. Interleaving is only
1874          * enabled across suitably sized nodes (default is >= 16MB), or
1875          * fall back to the largest node if they're all smaller.
1876          */
1877         nodes_clear(interleave_nodes);
1878         for_each_node_state(nid, N_HIGH_MEMORY) {
1879                 unsigned long total_pages = node_present_pages(nid);
1880
1881                 /* Preserve the largest node */
1882                 if (largest < total_pages) {
1883                         largest = total_pages;
1884                         prefer = nid;
1885                 }
1886
1887                 /* Interleave this node? */
1888                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1889                         node_set(nid, interleave_nodes);
1890         }
1891
1892         /* All too small, use the largest */
1893         if (unlikely(nodes_empty(interleave_nodes)))
1894                 node_set(prefer, interleave_nodes);
1895
1896         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1897                 printk("numa_policy_init: interleaving failed\n");
1898 }
1899
1900 /* Reset policy of current process to default */
1901 void numa_default_policy(void)
1902 {
1903         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1904 }
1905
1906 /*
1907  * Display pages allocated per node and memory policy via /proc.
1908  */
/* Policy names, indexed by policy mode; see mpol_to_str(). */
static const char * const policy_types[] = {
	"default",
	"prefer",
	"bind",
	"interleave",
};
1911
1912 /*
1913  * Convert a mempolicy into a string.
1914  * Returns the number of characters in buffer (if positive)
1915  * or an error (negative)
1916  */
1917 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1918 {
1919         char *p = buffer;
1920         int l;
1921         nodemask_t nodes;
1922         unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
1923         unsigned short flags = pol ? pol->flags : 0;
1924
1925         switch (mode) {
1926         case MPOL_DEFAULT:
1927                 nodes_clear(nodes);
1928                 break;
1929
1930         case MPOL_PREFERRED:
1931                 nodes_clear(nodes);
1932                 node_set(pol->v.preferred_node, nodes);
1933                 break;
1934
1935         case MPOL_BIND:
1936                 /* Fall through */
1937         case MPOL_INTERLEAVE:
1938                 nodes = pol->v.nodes;
1939                 break;
1940
1941         default:
1942                 BUG();
1943                 return -EFAULT;
1944         }
1945
1946         l = strlen(policy_types[mode]);
1947         if (buffer + maxlen < p + l + 1)
1948                 return -ENOSPC;
1949
1950         strcpy(p, policy_types[mode]);
1951         p += l;
1952
1953         if (flags) {
1954                 int need_bar = 0;
1955
1956                 if (buffer + maxlen < p + 2)
1957                         return -ENOSPC;
1958                 *p++ = '=';
1959
1960                 if (flags & MPOL_F_STATIC_NODES)
1961                         p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
1962                 if (flags & MPOL_F_RELATIVE_NODES)
1963                         p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
1964         }
1965
1966         if (!nodes_empty(nodes)) {
1967                 if (buffer + maxlen < p + 2)
1968                         return -ENOSPC;
1969                 *p++ = '=';
1970                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1971         }
1972         return p - buffer;
1973 }
1974
1975 struct numa_maps {
1976         unsigned long pages;
1977         unsigned long anon;
1978         unsigned long active;
1979         unsigned long writeback;
1980         unsigned long mapcount_max;
1981         unsigned long dirty;
1982         unsigned long swapcache;
1983         unsigned long node[MAX_NUMNODES];
1984 };
1985
1986 static void gather_stats(struct page *page, void *private, int pte_dirty)
1987 {
1988         struct numa_maps *md = private;
1989         int count = page_mapcount(page);
1990
1991         md->pages++;
1992         if (pte_dirty || PageDirty(page))
1993                 md->dirty++;
1994
1995         if (PageSwapCache(page))
1996                 md->swapcache++;
1997
1998         if (PageActive(page))
1999                 md->active++;
2000
2001         if (PageWriteback(page))
2002                 md->writeback++;
2003
2004         if (PageAnon(page))
2005                 md->anon++;
2006
2007         if (count > md->mapcount_max)
2008                 md->mapcount_max = count;
2009
2010         md->node[page_to_nid(page)]++;
2011 }
2012
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Gather statistics for a hugetlb VMA: step through [start, end) one huge
 * page at a time and pass every mapped huge page to gather_stats().
 */
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	unsigned long addr;
	struct page *page;

	for (addr = start; addr < end; addr += HPAGE_SIZE) {
		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
		pte_t pte;

		if (!ptep)
			continue;

		/*
		 * Read the entry once and use that snapshot for everything
		 * below, so the page and its dirty bit always come from the
		 * same pte value.
		 */
		pte = *ptep;
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (!page)
			continue;

		gather_stats(page, md, pte_dirty(pte));
	}
}
#else
/* Without CONFIG_HUGETLB_PAGE there is nothing to count. */
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif
2046
2047 int show_numa_map(struct seq_file *m, void *v)
2048 {
2049         struct proc_maps_private *priv = m->private;
2050         struct vm_area_struct *vma = v;
2051         struct numa_maps *md;
2052         struct file *file = vma->vm_file;
2053         struct mm_struct *mm = vma->vm_mm;
2054         struct mempolicy *pol;
2055         int n;
2056         char buffer[50];
2057
2058         if (!mm)
2059                 return 0;
2060
2061         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2062         if (!md)
2063                 return 0;
2064
2065         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2066         mpol_to_str(buffer, sizeof(buffer), pol);
2067         /*
2068          * unref shared or other task's mempolicy
2069          */
2070         if (pol != &default_policy && pol != current->mempolicy)
2071                 __mpol_put(pol);
2072
2073         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2074
2075         if (file) {
2076                 seq_printf(m, " file=");
2077                 seq_path(m, &file->f_path, "\n\t= ");
2078         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2079                 seq_printf(m, " heap");
2080         } else if (vma->vm_start <= mm->start_stack &&
2081                         vma->vm_end >= mm->start_stack) {
2082                 seq_printf(m, " stack");
2083         }
2084
2085         if (is_vm_hugetlb_page(vma)) {
2086                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2087                 seq_printf(m, " huge");
2088         } else {
2089                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2090                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2091         }
2092
2093         if (!md->pages)
2094                 goto out;
2095
2096         if (md->anon)
2097                 seq_printf(m," anon=%lu",md->anon);
2098
2099         if (md->dirty)
2100                 seq_printf(m," dirty=%lu",md->dirty);
2101
2102         if (md->pages != md->anon && md->pages != md->dirty)
2103                 seq_printf(m, " mapped=%lu", md->pages);
2104
2105         if (md->mapcount_max > 1)
2106                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2107
2108         if (md->swapcache)
2109                 seq_printf(m," swapcache=%lu", md->swapcache);
2110
2111         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2112                 seq_printf(m," active=%lu", md->active);
2113
2114         if (md->writeback)
2115                 seq_printf(m," writeback=%lu", md->writeback);
2116
2117         for_each_node_state(n, N_HIGH_MEMORY)
2118                 if (md->node[n])
2119                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2120 out:
2121         seq_putc(m, '\n');
2122         kfree(md);
2123
2124         if (m->count < m->size)
2125                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2126         return 0;
2127 }