/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES      5

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* for remembering the boot option */
#else
#define do_swap_account         (0)
#endif
/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
        /*
         * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
         */
        MEM_CGROUP_STAT_CACHE,          /* # of pages charged as cache */
        MEM_CGROUP_STAT_RSS,            /* # of pages charged as rss */
        MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
        MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */

        MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
        s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
        struct mem_cgroup_stat_cpu cpustat[0];
};
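/*
 * Illustrative note (not from the original file): cpustat[] is a zero-length
 * array, so struct mem_cgroup_stat occupies no space by itself.  The per-cpu
 * slots only exist because mem_cgroup_alloc() (below) allocates
 *
 *      sizeof(struct mem_cgroup) + nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu)
 *
 * in one chunk (see mem_cgroup_size()), which is why struct mem_cgroup
 * requires its stat member to be placed last.
 */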
/*
 * For accounting under irq disable, no need for increment preempt count.
 */
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
                enum mem_cgroup_stat_index idx, int val)
{
        stat->count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
                enum mem_cgroup_stat_index idx)
{
        int cpu;
        s64 ret = 0;

        for_each_possible_cpu(cpu)
                ret += stat->cpustat[cpu].count[idx];
        return ret;
}

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
        /*
         * spin_lock to protect the per cgroup LRU
         */
        spinlock_t              lru_lock;
        struct list_head        lists[NR_LRU_LISTS];
        unsigned long           count[NR_LRU_LISTS];
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])
struct mem_cgroup_per_node {
        struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
        struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. May be even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark, this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
        struct cgroup_subsys_state css;
        /*
         * the counter to account for memory usage
         */
        struct res_counter res;
        /*
         * Per cgroup active and inactive list, similar to the
         * per zone LRU lists.
         */
        struct mem_cgroup_lru_info info;

        int     prev_priority;  /* for recording reclaim priority */
        /*
         * statistics. This must be placed at the end of memcg.
         */
        struct mem_cgroup_stat stat;
};

enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
        MEM_CGROUP_CHARGE_TYPE_MAPPED,
        MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
        MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
        MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
        NR_CHARGE_TYPE,
};
/* only for here (for easy reading.) */
#define PCGF_CACHE      (1UL << PCG_CACHE)
#define PCGF_USED       (1UL << PCG_USED)
#define PCGF_ACTIVE     (1UL << PCG_ACTIVE)
#define PCGF_LOCK       (1UL << PCG_LOCK)
#define PCGF_FILE       (1UL << PCG_FILE)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
        PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
        PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
        PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
        0, /* FORCE */
};

/*
 * Always modified under lru lock. Then, not necessary to preempt_disable()
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
                                         struct page_cgroup *pc,
                                         bool charge)
{
        int val = (charge)? 1 : -1;
        struct mem_cgroup_stat *stat = &mem->stat;
        struct mem_cgroup_stat_cpu *cpustat;

        VM_BUG_ON(!irqs_disabled());

        cpustat = &stat->cpustat[smp_processor_id()];
        if (PageCgroupCache(pc))
                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
        else
                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

        if (charge)
                __mem_cgroup_stat_add_safe(cpustat,
                                MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
        else
                __mem_cgroup_stat_add_safe(cpustat,
                                MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
}
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
        return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
        struct mem_cgroup *mem = pc->mem_cgroup;
        int nid = page_cgroup_nid(pc);
        int zid = page_cgroup_zid(pc);

        return mem_cgroup_zoneinfo(mem, nid, zid);
}
static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
                                        enum lru_list idx)
{
        int nid, zid;
        struct mem_cgroup_per_zone *mz;
        u64 total = 0;

        for_each_online_node(nid)
                for (zid = 0; zid < MAX_NR_ZONES; zid++) {
                        mz = mem_cgroup_zoneinfo(mem, nid, zid);
                        total += MEM_CGROUP_ZSTAT(mz, idx);
                }
        return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
        return container_of(cgroup_subsys_state(cont,
                                mem_cgroup_subsys_id), struct mem_cgroup,
                                css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
        /*
         * mm_update_next_owner() may clear mm->owner to NULL
         * if it races with swapoff, page migration, etc.
         * So this can be called with p == NULL.
         */
        if (unlikely(!p))
                return NULL;

        return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
                                struct mem_cgroup, css);
}
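/*
 * Illustrative sketch (not part of the original file): a typical lookup of
 * the memory cgroup that a task is charged against.  mm->owner is RCU
 * protected, so callers such as __mem_cgroup_try_charge() below wrap the
 * dereference in rcu_read_lock()/rcu_read_unlock():
 *
 *      struct mem_cgroup *mem;
 *
 *      rcu_read_lock();
 *      mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 *      if (mem)
 *              css_get(&mem->css);     /- pin it before dropping the RCU lock -/
 *      rcu_read_unlock();
 */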
static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
                                        struct page_cgroup *pc)
{
        int lru = LRU_BASE;

        if (PageCgroupUnevictable(pc))
                lru = LRU_UNEVICTABLE;
        else {
                if (PageCgroupActive(pc))
                        lru += LRU_ACTIVE;
                if (PageCgroupFile(pc))
                        lru += LRU_FILE;
        }

        MEM_CGROUP_ZSTAT(mz, lru) -= 1;

        mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
        list_del(&pc->lru);
}

static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
                                struct page_cgroup *pc, bool hot)
{
        int lru = LRU_BASE;

        if (PageCgroupUnevictable(pc))
                lru = LRU_UNEVICTABLE;
        else {
                if (PageCgroupActive(pc))
                        lru += LRU_ACTIVE;
                if (PageCgroupFile(pc))
                        lru += LRU_FILE;
        }

        MEM_CGROUP_ZSTAT(mz, lru) += 1;
        if (hot)
                list_add(&pc->lru, &mz->lists[lru]);
        else
                list_add_tail(&pc->lru, &mz->lists[lru]);

        mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
}
static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
{
        struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
        int active = PageCgroupActive(pc);
        int file = PageCgroupFile(pc);
        int unevictable = PageCgroupUnevictable(pc);
        enum lru_list from = unevictable ? LRU_UNEVICTABLE :
                                (LRU_FILE * !!file + !!active);

        if (lru == from)
                return;

        MEM_CGROUP_ZSTAT(mz, from) -= 1;
        /*
         * This is done under mz->lru_lock, but other flags, which are not
         * related to the LRU, may be modified outside the lock.
         * We have to use atomic set/clear flags.
         */
        if (is_unevictable_lru(lru)) {
                ClearPageCgroupActive(pc);
                SetPageCgroupUnevictable(pc);
        } else {
                if (is_active_lru(lru))
                        SetPageCgroupActive(pc);
                else
                        ClearPageCgroupActive(pc);
                ClearPageCgroupUnevictable(pc);
        }

        MEM_CGROUP_ZSTAT(mz, lru) += 1;
        list_move(&pc->lru, &mz->lists[lru]);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
        int ret;

        task_lock(task);
        ret = task->mm && mm_match_cgroup(task->mm, mem);
        task_unlock(task);
        return ret;
}
/*
 * This routine assumes that the appropriate zone's lru lock is already held
 */
void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
{
        struct page_cgroup *pc;
        struct mem_cgroup_per_zone *mz;
        unsigned long flags;

        if (mem_cgroup_subsys.disabled)
                return;

        /*
         * We cannot lock_page_cgroup while holding zone's lru_lock,
         * because other holders of lock_page_cgroup can be interrupted
         * with an attempt to rotate_reclaimable_page.  But we cannot
         * safely get to page_cgroup without it, so just try_lock it:
         * mem_cgroup_isolate_pages allows for page left on wrong list.
         */
        pc = lookup_page_cgroup(page);
        if (!trylock_page_cgroup(pc))
                return;
        if (pc && PageCgroupUsed(pc)) {
                mz = page_cgroup_zoneinfo(pc);
                spin_lock_irqsave(&mz->lru_lock, flags);
                __mem_cgroup_move_lists(pc, lru);
                spin_unlock_irqrestore(&mz->lru_lock, flags);
        }
        unlock_page_cgroup(pc);
}
/*
 * Calculate mapped_ratio under memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
        long total, rss;

        /*
         * usage is recorded in bytes. But, here, we assume the number of
         * physical pages can be represented by "long" on any arch.
         */
        total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
        rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
        return (int)((rss * 100L) / total);
}
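/*
 * Worked example (illustrative, not from the original file): with a cgroup
 * whose usage is 400MB (total = (400MB >> PAGE_SHIFT) + 1 = 102401 pages
 * with 4KB pages) and whose RSS counter says 100MB (rss = 25600 pages), the
 * function returns (25600 * 100) / 102401 = 24, i.e. roughly a quarter of
 * the charged memory is mapped rss.
 */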
/*
 * prev_priority control...this will be used in memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
        return mem->prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
        if (priority < mem->prev_priority)
                mem->prev_priority = priority;
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
        mem->prev_priority = priority;
}

/*
 * Calculate # of pages to be scanned in this priority/zone.
 *
 * priority starts from "DEF_PRIORITY" and decremented in each loop.
 * (see include/linux/mmzone.h)
 */
long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
                                        int priority, enum lru_list lru)
{
        long nr_pages;
        int nid = zone->zone_pgdat->node_id;
        int zid = zone_idx(zone);
        struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

        nr_pages = MEM_CGROUP_ZSTAT(mz, lru);

        return (nr_pages >> priority);
}
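/*
 * Illustrative arithmetic (not from the original file): as in the global
 * reclaim path, the scan target halves with each priority level.  With
 * 65536 pages on the chosen per-zone LRU list and DEF_PRIORITY == 12, the
 * first reclaim pass asks for 65536 >> 12 = 16 pages; if that is not enough
 * the priority drops and the next pass scans 32, then 64, and so on, up to
 * the whole list at priority 0.
 */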
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
                                        struct list_head *dst,
                                        unsigned long *scanned, int order,
                                        int mode, struct zone *z,
                                        struct mem_cgroup *mem_cont,
                                        int active, int file)
{
        unsigned long nr_taken = 0;
        struct page *page;
        unsigned long scan;
        LIST_HEAD(pc_list);
        struct list_head *src;
        struct page_cgroup *pc, *tmp;
        int nid = z->zone_pgdat->node_id;
        int zid = zone_idx(z);
        struct mem_cgroup_per_zone *mz;
        int lru = LRU_FILE * !!file + !!active;

        mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
        src = &mz->lists[lru];

        spin_lock(&mz->lru_lock);
        scan = 0;
        list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
                if (scan >= nr_to_scan)
                        break;
                if (unlikely(!PageCgroupUsed(pc)))
                        continue;
                page = pc->page;
                if (unlikely(!PageLRU(page)))
                        continue;

                /*
                 * TODO: play better with lumpy reclaim, grabbing anything.
                 */
                if (PageUnevictable(page) ||
                    (PageActive(page) && !active) ||
                    (!PageActive(page) && active)) {
                        __mem_cgroup_move_lists(pc, page_lru(page));
                        continue;
                }

                scan++;
                list_move(&pc->lru, &pc_list);

                if (__isolate_lru_page(page, mode, file) == 0) {
                        list_move(&page->lru, dst);
                        nr_taken++;
                }
        }

        list_splice(&pc_list, src);
        spin_unlock(&mz->lru_lock);

        *scanned = scan;
        return nr_taken;
}
/*
 * Unlike exported interface, "oom" parameter is added. if oom==true,
 * oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
                        gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
        struct mem_cgroup *mem;
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
         * set, if so charge the init_mm (happens for pagecache usage).
         */
        if (likely(!*memcg)) {
                rcu_read_lock();
                mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
                if (unlikely(!mem)) {
                        rcu_read_unlock();
                        return 0;
                }
                /*
                 * For every charge from the cgroup, increment reference count
                 */
                css_get(&mem->css);
                *memcg = mem;
                rcu_read_unlock();
        } else {
                mem = *memcg;
                css_get(&mem->css);
        }

        while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
                if (!(gfp_mask & __GFP_WAIT))
                        goto nomem;

                if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
                        continue;

                /*
                 * try_to_free_mem_cgroup_pages() might not give us a full
                 * picture of reclaim. Some pages are reclaimed and might be
                 * moved to swap cache or just unmapped from the cgroup.
                 * Check the limit again to see if the reclaim reduced the
                 * current usage of the cgroup before giving up
                 */
                if (res_counter_check_under_limit(&mem->res))
                        continue;

                if (!nr_retries--) {
                        if (oom)
                                mem_cgroup_out_of_memory(mem, gfp_mask);
                        goto nomem;
                }
        }
        return 0;
nomem:
        css_put(&mem->css);
        return -ENOMEM;
}

/**
 * mem_cgroup_try_charge - get charge of PAGE_SIZE.
 * @mm: an mm_struct which is charged against. (when *memcg is NULL)
 * @gfp_mask: gfp_mask for reclaim.
 * @memcg: a pointer to memory cgroup which is charged against.
 *
 * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated
 * memory cgroup from @mm is got and stored in *memcg.
 *
 * Returns 0 if success. -ENOMEM at failure.
 * This call can invoke OOM-Killer.
 */
int mem_cgroup_try_charge(struct mm_struct *mm,
                          gfp_t mask, struct mem_cgroup **memcg)
{
        return __mem_cgroup_try_charge(mm, mask, memcg, true);
}
/*
 * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be
 * USED state. If already USED, uncharge and return.
 */
static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
                                     struct page_cgroup *pc,
                                     enum charge_type ctype)
{
        struct mem_cgroup_per_zone *mz;
        unsigned long flags;

        /* try_charge() can return NULL to *memcg, taking care of it. */
        if (!mem)
                return;

        lock_page_cgroup(pc);
        if (unlikely(PageCgroupUsed(pc))) {
                unlock_page_cgroup(pc);
                res_counter_uncharge(&mem->res, PAGE_SIZE);
                css_put(&mem->css);
                return;
        }
        pc->mem_cgroup = mem;
        /*
         * If a page is accounted as a page cache, insert to inactive list.
         * If anon, insert to active list.
         */
        pc->flags = pcg_default_flags[ctype];

        mz = page_cgroup_zoneinfo(pc);

        spin_lock_irqsave(&mz->lru_lock, flags);
        __mem_cgroup_add_list(mz, pc, true);
        spin_unlock_irqrestore(&mz->lru_lock, flags);
        unlock_page_cgroup(pc);
}
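/*
 * Illustrative sketch (not part of the original file): charging is a
 * two-step protocol.  A caller first reserves PAGE_SIZE with
 * mem_cgroup_try_charge() (which may reclaim or invoke the OOM killer) and
 * later commits the reservation to a specific page once the page is known,
 * exactly as mem_cgroup_charge_common() below does:
 *
 *      struct mem_cgroup *mem = NULL;
 *
 *      if (mem_cgroup_try_charge(mm, GFP_KERNEL, &mem))
 *              return -ENOMEM;         /- charge failed -/
 *      ...
 *      __mem_cgroup_commit_charge(mem, lookup_page_cgroup(page),
 *                                 MEM_CGROUP_CHARGE_TYPE_MAPPED);
 *
 * If the page cannot be used after all, the reservation must be dropped
 * again (see mem_cgroup_cancel_charge_swapin() below for the swap-in case).
 */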
/**
 * mem_cgroup_move_account - move account of the page
 * @pc: page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm following.
 * 1. disable irq.
 * 2. lru_lock of old mem_cgroup(@from) should be held.
 *
 * returns 0 at success,
 * returns -EBUSY when lock is busy or "pc" is unstable.
 *
 * This function does "uncharge" from old cgroup but doesn't do "charge" to
 * new cgroup. It should be done by a caller.
 */
static int mem_cgroup_move_account(struct page_cgroup *pc,
        struct mem_cgroup *from, struct mem_cgroup *to)
{
        struct mem_cgroup_per_zone *from_mz, *to_mz;
        int nid, zid;
        int ret = -EBUSY;

        VM_BUG_ON(!irqs_disabled());
        VM_BUG_ON(from == to);

        nid = page_cgroup_nid(pc);
        zid = page_cgroup_zid(pc);
        from_mz = mem_cgroup_zoneinfo(from, nid, zid);
        to_mz = mem_cgroup_zoneinfo(to, nid, zid);

        if (!trylock_page_cgroup(pc))
                return ret;

        if (!PageCgroupUsed(pc))
                goto out;

        if (pc->mem_cgroup != from)
                goto out;

        if (spin_trylock(&to_mz->lru_lock)) {
                __mem_cgroup_remove_list(from_mz, pc);
                css_put(&from->css);
                res_counter_uncharge(&from->res, PAGE_SIZE);
                pc->mem_cgroup = to;
                css_get(&to->css);
                __mem_cgroup_add_list(to_mz, pc, false);
                ret = 0;
                spin_unlock(&to_mz->lru_lock);
        }
out:
        unlock_page_cgroup(pc);
        return ret;
}
/*
 * move charges to its parent.
 */
static int mem_cgroup_move_parent(struct page_cgroup *pc,
                                  struct mem_cgroup *child,
                                  gfp_t gfp_mask)
{
        struct cgroup *cg = child->css.cgroup;
        struct cgroup *pcg = cg->parent;
        struct mem_cgroup *parent;
        struct mem_cgroup_per_zone *mz;
        unsigned long flags;
        int ret;

        /* Is ROOT ? */
        if (!pcg)
                return -EINVAL;

        parent = mem_cgroup_from_cont(pcg);

        ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
        if (ret)
                return ret;

        mz = mem_cgroup_zoneinfo(child,
                        page_cgroup_nid(pc), page_cgroup_zid(pc));

        spin_lock_irqsave(&mz->lru_lock, flags);
        ret = mem_cgroup_move_account(pc, child, parent);
        spin_unlock_irqrestore(&mz->lru_lock, flags);

        /* drop extra refcnt */
        css_put(&parent->css);
        /* uncharge if move fails */
        if (ret)
                res_counter_uncharge(&parent->res, PAGE_SIZE);

        return ret;
}
/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask, enum charge_type ctype,
                                struct mem_cgroup *memcg)
{
        struct mem_cgroup *mem;
        struct page_cgroup *pc;
        int ret;

        pc = lookup_page_cgroup(page);
        /* can happen at boot */
        if (unlikely(!pc))
                return 0;

        mem = memcg;
        ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
        if (ret)
                return ret;

        __mem_cgroup_commit_charge(mem, pc, ctype);
        return 0;
}

int mem_cgroup_newpage_charge(struct page *page,
                              struct mm_struct *mm, gfp_t gfp_mask)
{
        if (mem_cgroup_subsys.disabled)
                return 0;
        if (PageCompound(page))
                return 0;
        /*
         * If already mapped, we don't have to account.
         * If page cache, page->mapping has address_space.
         * But page->mapping may have out-of-use anon_vma pointer,
         * detect it by PageAnon() check. newly-mapped-anon's page->mapping
         * is NULL.
         */
        if (page_mapped(page) || (page->mapping && !PageAnon(page)))
                return 0;
        if (unlikely(!mm))
                mm = &init_mm;
        return mem_cgroup_charge_common(page, mm, gfp_mask,
                                MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask)
{
        if (mem_cgroup_subsys.disabled)
                return 0;
        if (PageCompound(page))
                return 0;
        /*
         * Corner case handling. This is usually called from
         * add_to_page_cache(). But some filesystems (shmem) pre-charge the
         * page before calling it and then call add_to_page_cache() with
         * GFP_NOWAIT.
         *
         * For the GFP_NOWAIT case, the page may already be charged before we
         * get here. (See shmem.c) Check it here and avoid charging twice.
         * (It works, but has to pay a bit larger cost.)
         */
        if (!(gfp_mask & __GFP_WAIT)) {
                struct page_cgroup *pc;

                pc = lookup_page_cgroup(page);
                if (!pc)
                        return 0;
                lock_page_cgroup(pc);
                if (PageCgroupUsed(pc)) {
                        unlock_page_cgroup(pc);
                        return 0;
                }
                unlock_page_cgroup(pc);
        }

        if (unlikely(!mm))
                mm = &init_mm;

        if (page_is_file_cache(page))
                return mem_cgroup_charge_common(page, mm, gfp_mask,
                                MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
        else
                return mem_cgroup_charge_common(page, mm, gfp_mask,
                                MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
}
int mem_cgroup_cache_charge_swapin(struct page *page,
                        struct mm_struct *mm, gfp_t mask, bool locked)
{
        int ret = 0;

        if (mem_cgroup_subsys.disabled)
                return 0;
        if (unlikely(!mm))
                mm = &init_mm;
        if (!locked)
                lock_page(page);
        /*
         * If not locked, the page can be dropped from SwapCache until
         * we reach here.
         */
        if (PageSwapCache(page)) {
                ret = mem_cgroup_charge_common(page, mm, mask,
                                MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
        }
        if (!locked)
                unlock_page(page);

        return ret;
}

void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
        struct page_cgroup *pc;

        if (mem_cgroup_subsys.disabled)
                return;
        if (!ptr)
                return;
        pc = lookup_page_cgroup(page);
        __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
        if (mem_cgroup_subsys.disabled)
                return;
        if (!mem)
                return;
        res_counter_uncharge(&mem->res, PAGE_SIZE);
        css_put(&mem->css);
}
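/*
 * Illustrative sketch (not part of the original file): the swap-in path uses
 * the try/commit/cancel trio above.  A do_swap_page()-style caller would
 * reserve the charge before taking its locks, then either commit it to the
 * page on success or cancel it on failure:
 *
 *      struct mem_cgroup *ptr = NULL;
 *
 *      if (mem_cgroup_try_charge(mm, GFP_KERNEL, &ptr))
 *              goto oom;
 *      ...                             /- map the page under page table lock -/
 *      if (ok)
 *              mem_cgroup_commit_charge_swapin(page, ptr);
 *      else
 *              mem_cgroup_cancel_charge_swapin(ptr);
 */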
/*
 * uncharge if !page_mapped(page)
 */
static void
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
        struct page_cgroup *pc;
        struct mem_cgroup *mem;
        struct mem_cgroup_per_zone *mz;
        unsigned long flags;

        if (mem_cgroup_subsys.disabled)
                return;
        if (PageSwapCache(page))
                return;

        /*
         * Check if our page_cgroup is valid
         */
        pc = lookup_page_cgroup(page);
        if (unlikely(!pc || !PageCgroupUsed(pc)))
                return;

        lock_page_cgroup(pc);
        if (!PageCgroupUsed(pc))
                goto unlock_out;

        switch (ctype) {
        case MEM_CGROUP_CHARGE_TYPE_MAPPED:
                if (page_mapped(page))
                        goto unlock_out;
                break;
        case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
                if (!PageAnon(page)) {  /* Shared memory */
                        if (page->mapping && !page_is_file_cache(page))
                                goto unlock_out;
                } else if (page_mapped(page)) /* Anon */
                        goto unlock_out;
                break;
        default:
                break;
        }

        ClearPageCgroupUsed(pc);
        mem = pc->mem_cgroup;

        mz = page_cgroup_zoneinfo(pc);
        spin_lock_irqsave(&mz->lru_lock, flags);
        __mem_cgroup_remove_list(mz, pc);
        spin_unlock_irqrestore(&mz->lru_lock, flags);
        unlock_page_cgroup(pc);

        res_counter_uncharge(&mem->res, PAGE_SIZE);
        css_put(&mem->css);
        return;

unlock_out:
        unlock_page_cgroup(pc);
}

void mem_cgroup_uncharge_page(struct page *page)
{
        if (page_mapped(page))
                return;
        if (page->mapping && !PageAnon(page))
                return;
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
        VM_BUG_ON(page_mapped(page));
        VM_BUG_ON(page->mapping);
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}

void mem_cgroup_uncharge_swapcache(struct page *page)
{
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
}
/*
 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
 * page belongs to.
 */
int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
{
        struct page_cgroup *pc;
        struct mem_cgroup *mem = NULL;
        int ret = 0;

        if (mem_cgroup_subsys.disabled)
                return 0;

        pc = lookup_page_cgroup(page);
        lock_page_cgroup(pc);
        if (PageCgroupUsed(pc)) {
                mem = pc->mem_cgroup;
                css_get(&mem->css);
        }
        unlock_page_cgroup(pc);
        if (mem) {
                ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem);
                css_put(&mem->css);
        }
        *ptr = mem;
        return ret;
}

/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *mem,
                struct page *oldpage, struct page *newpage)
{
        struct page *target, *unused;
        struct page_cgroup *pc;
        enum charge_type ctype;

        if (!mem)
                return;

        /* at migration success, oldpage->mapping is NULL. */
        if (oldpage->mapping) {
                target = oldpage;
                unused = NULL;
        } else {
                target = newpage;
                unused = oldpage;
        }

        if (PageAnon(target))
                ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
        else if (page_is_file_cache(target))
                ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
        else
                ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;

        /* unused page is not on radix-tree now. */
        if (unused)
                __mem_cgroup_uncharge_common(unused, ctype);

        pc = lookup_page_cgroup(target);
        /*
         * __mem_cgroup_commit_charge() checks the PCG_USED bit of page_cgroup.
         * So, double-counting is effectively avoided.
         */
        __mem_cgroup_commit_charge(mem, pc, ctype);

        /*
         * Both of oldpage and newpage are still under lock_page().
         * Then, we don't have to care about race in radix-tree.
         * But we have to be careful that this page is unmapped or not.
         *
         * There is a case for !page_mapped(). At the start of
         * migration, oldpage was mapped. But now, it's zapped.
         * But we know *target* page is not freed/reused under us.
         * mem_cgroup_uncharge_page() does all necessary checks.
         */
        if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
                mem_cgroup_uncharge_page(target);
}
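/*
 * Illustrative sketch (not part of the original file): how the migration
 * hooks above pair up in a migrate_pages()-style caller.  The old page's
 * charge is pinned before copying and settled afterwards, whether the copy
 * succeeded or not (copy_step() is a made-up stand-in for the actual copy):
 *
 *      struct mem_cgroup *mem = NULL;
 *      int rc;
 *
 *      rc = mem_cgroup_prepare_migration(oldpage, &mem);
 *      if (rc)
 *              return rc;              /- charge could not be prepared -/
 *      rc = copy_step(newpage, oldpage);
 *      mem_cgroup_end_migration(mem, oldpage, newpage);
 */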
/*
 * A call to try to shrink memory usage under specified resource controller.
 * This is typically used for page reclaiming for shmem for reducing side
 * effect of page allocation from shmem, which is used by some mem_cgroup.
 */
int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
{
        struct mem_cgroup *mem;
        int progress = 0;
        int retry = MEM_CGROUP_RECLAIM_RETRIES;

        if (mem_cgroup_subsys.disabled)
                return 0;
        if (!mm)
                return 0;

        rcu_read_lock();
        mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
        if (unlikely(!mem)) {
                rcu_read_unlock();
                return 0;
        }
        css_get(&mem->css);
        rcu_read_unlock();

        do {
                progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
                progress += res_counter_check_under_limit(&mem->res);
        } while (!progress && --retry);

        css_put(&mem->css);
        if (!retry)
                return -ENOMEM;
        return 0;
}

static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                                   unsigned long long val)
{
        int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
        int progress;
        int ret = 0;

        while (res_counter_set_limit(&memcg->res, val)) {
                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }
                if (!retry_count) {
                        ret = -EBUSY;
                        break;
                }
                progress = try_to_free_mem_cgroup_pages(memcg,
                                GFP_HIGHUSER_MOVABLE);
                if (!progress)
                        retry_count--;
        }

        return ret;
}
/*
 * This routine traverses page_cgroups in the given list and drops them all.
 * *And* this routine doesn't reclaim the page itself, it just removes the
 * page_cgroup.
 */
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
                                struct mem_cgroup_per_zone *mz,
                                enum lru_list lru)
{
        struct page_cgroup *pc, *busy;
        unsigned long flags;
        unsigned long loop;
        struct list_head *list;
        int ret = 0;

        list = &mz->lists[lru];

        loop = MEM_CGROUP_ZSTAT(mz, lru);
        /* give some margin against EBUSY etc...*/
        loop += 256;
        busy = NULL;
        while (loop--) {
                ret = 0;
                spin_lock_irqsave(&mz->lru_lock, flags);
                if (list_empty(list)) {
                        spin_unlock_irqrestore(&mz->lru_lock, flags);
                        break;
                }
                pc = list_entry(list->prev, struct page_cgroup, lru);
                if (busy == pc) {
                        list_move(&pc->lru, list);
                        busy = NULL;
                        spin_unlock_irqrestore(&mz->lru_lock, flags);
                        continue;
                }
                spin_unlock_irqrestore(&mz->lru_lock, flags);

                ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
                if (ret == -ENOMEM)
                        break;

                if (ret == -EBUSY || ret == -EINVAL) {
                        /* found lock contention or "pc" is obsolete. */
                        busy = pc;
                        cond_resched();
                } else
                        busy = NULL;
        }
        if (!ret && !list_empty(list))
                return -EBUSY;
        return ret;
}
/*
 * make mem_cgroup's charge to be 0 if there is no task.
 * This enables deleting this mem_cgroup.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
{
        int ret;
        int node, zid, shrink;
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
        struct cgroup *cgrp = mem->css.cgroup;

        css_get(&mem->css);

        shrink = 0;
        /* should free all ? */
        if (free_all)
                goto try_to_free;
move_account:
        while (mem->res.usage > 0) {
                ret = -EBUSY;
                if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
                        goto out;
                ret = -EINTR;
                if (signal_pending(current))
                        goto out;
                /* This is for making all *used* pages to be on LRU. */
                lru_add_drain_all();
                ret = 0;
                for_each_node_state(node, N_POSSIBLE) {
                        for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
                                struct mem_cgroup_per_zone *mz;
                                enum lru_list l;

                                mz = mem_cgroup_zoneinfo(mem, node, zid);
                                for_each_lru(l) {
                                        ret = mem_cgroup_force_empty_list(mem,
                                                                mz, l);
                                        if (ret)
                                                break;
                                }
                        }
                        if (ret)
                                break;
                }
                /* it seems parent cgroup doesn't have enough mem */
                if (ret == -ENOMEM)
                        goto try_to_free;
                cond_resched();
        }
        ret = 0;
out:
        css_put(&mem->css);
        return ret;

try_to_free:
        /* returns EBUSY if there is a task or if we come here twice. */
        if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
                ret = -EBUSY;
                goto out;
        }
        /* we call try-to-free pages for make this cgroup empty */
        lru_add_drain_all();
        /* try to free all pages in this cgroup */
        shrink = 1;
        while (nr_retries && mem->res.usage > 0) {
                int progress;

                if (signal_pending(current)) {
                        ret = -EINTR;
                        goto out;
                }
                progress = try_to_free_mem_cgroup_pages(mem,
                                GFP_HIGHUSER_MOVABLE);
                if (!progress)
                        nr_retries--;
                /* maybe some writeback is necessary */
                congestion_wait(WRITE, HZ/10);
        }
        /* try move_account...there may be some *locked* pages. */
        goto move_account;
}

int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
        return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
        return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
                                    cft->private);
}

/*
 * The user of this function is...
 * RES_LIMIT.
 */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
                            const char *buffer)
{
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
        unsigned long long val;
        int ret;

        switch (cft->private) {
        case RES_LIMIT:
                /* This function does all necessary parse...reuse it */
                ret = res_counter_memparse_write_strategy(buffer, &val);
                if (!ret)
                        ret = mem_cgroup_resize_limit(memcg, val);
                break;
        default:
                ret = -EINVAL; /* should be BUG() ? */
                break;
        }
        return ret;
}

static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
        struct mem_cgroup *mem;

        mem = mem_cgroup_from_cont(cont);
        switch (event) {
        case RES_MAX_USAGE:
                res_counter_reset_max(&mem->res);
                break;
        case RES_FAILCNT:
                res_counter_reset_failcnt(&mem->res);
                break;
        }
        return 0;
}

static const struct mem_cgroup_stat_desc {
        const char *msg;
        u64 unit;
} mem_cgroup_stat_desc[] = {
        [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
        [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
        [MEM_CGROUP_STAT_PGPGIN_COUNT] = { "pgpgin", 1, },
        [MEM_CGROUP_STAT_PGPGOUT_COUNT] = { "pgpgout", 1, },
};
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
                                 struct cgroup_map_cb *cb)
{
        struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
        struct mem_cgroup_stat *stat = &mem_cont->stat;
        int i;

        for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
                s64 val;

                val = mem_cgroup_read_stat(stat, i);
                val *= mem_cgroup_stat_desc[i].unit;
                cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
        }
        /* showing # of active pages */
        {
                unsigned long active_anon, inactive_anon;
                unsigned long active_file, inactive_file;
                unsigned long unevictable;

                inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
                                                LRU_INACTIVE_ANON);
                active_anon = mem_cgroup_get_all_zonestat(mem_cont,
                                                LRU_ACTIVE_ANON);
                inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
                                                LRU_INACTIVE_FILE);
                active_file = mem_cgroup_get_all_zonestat(mem_cont,
                                                LRU_ACTIVE_FILE);
                unevictable = mem_cgroup_get_all_zonestat(mem_cont,
                                                LRU_UNEVICTABLE);

                cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
                cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
                cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
                cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
                cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
        }
        return 0;
}
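/*
 * Illustrative output (not from the original file): reading the "stat" file
 * defined below produces one "name value" pair per cb->fill() call above,
 * with byte-based counters scaled by PAGE_SIZE and event counters left as
 * raw counts, e.g. (values are made up):
 *
 *      cache 8192000
 *      rss 4096000
 *      pgpgin 4135
 *      pgpgout 1129
 *      active_anon 2048000
 *      inactive_anon 0
 *      active_file 6144000
 *      inactive_file 2048000
 *      unevictable 0
 */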
static struct cftype mem_cgroup_files[] = {
        {
                .name = "usage_in_bytes",
                .private = RES_USAGE,
                .read_u64 = mem_cgroup_read,
        },
        {
                .name = "max_usage_in_bytes",
                .private = RES_MAX_USAGE,
                .trigger = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read,
        },
        {
                .name = "limit_in_bytes",
                .private = RES_LIMIT,
                .write_string = mem_cgroup_write,
                .read_u64 = mem_cgroup_read,
        },
        {
                .name = "failcnt",
                .private = RES_FAILCNT,
                .trigger = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read,
        },
        {
                .name = "stat",
                .read_map = mem_control_stat_show,
        },
        {
                .name = "force_empty",
                .trigger = mem_cgroup_force_empty_write,
        },
};
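/*
 * Illustrative usage (not from the original file): once the subsystem is
 * mounted, each entry above appears as a "memory."-prefixed file in the
 * cgroup directory, e.g. (paths and values are examples only):
 *
 *      # mount -t cgroup -o memory none /cgroups
 *      # mkdir /cgroups/grp0
 *      # echo 64M > /cgroups/grp0/memory.limit_in_bytes
 *      # cat /cgroups/grp0/memory.usage_in_bytes
 *      0
 *      # echo 1 > /cgroups/grp0/memory.force_empty
 *
 * Writes to memory.limit_in_bytes go through mem_cgroup_write() ->
 * res_counter_memparse_write_strategy(), which accepts K/M/G suffixes.
 */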
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
        struct mem_cgroup_per_node *pn;
        struct mem_cgroup_per_zone *mz;
        enum lru_list l;
        int zone, tmp = node;
        /*
         * This routine is called against possible nodes.
         * But it's BUG to call kmalloc() against offline node.
         *
         * TODO: this routine can waste much memory for nodes which will
         *       never be onlined. It's better to use memory hotplug callback
         *       function.
         */
        if (!node_state(node, N_NORMAL_MEMORY))
                tmp = -1;
        pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
        if (!pn)
                return 1;

        mem->info.nodeinfo[node] = pn;
        memset(pn, 0, sizeof(*pn));

        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                mz = &pn->zoneinfo[zone];
                spin_lock_init(&mz->lru_lock);
                for_each_lru(l)
                        INIT_LIST_HEAD(&mz->lists[l]);
        }
        return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
        kfree(mem->info.nodeinfo[node]);
}

static int mem_cgroup_size(void)
{
        int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);

        return sizeof(struct mem_cgroup) + cpustat_size;
}
static struct mem_cgroup *mem_cgroup_alloc(void)
{
        struct mem_cgroup *mem;
        int size = mem_cgroup_size();

        if (size < PAGE_SIZE)
                mem = kmalloc(size, GFP_KERNEL);
        else
                mem = vmalloc(size);

        if (mem)
                memset(mem, 0, size);
        return mem;
}

static void mem_cgroup_free(struct mem_cgroup *mem)
{
        if (mem_cgroup_size() < PAGE_SIZE)
                kfree(mem);
        else
                vfree(mem);
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static void __init enable_swap_cgroup(void)
{
        if (!mem_cgroup_subsys.disabled && really_do_swap_account)
                do_swap_account = 1;
}
#else
static void __init enable_swap_cgroup(void)
{
}
#endif
static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
        struct mem_cgroup *mem;
        int node;

        mem = mem_cgroup_alloc();
        if (!mem)
                return ERR_PTR(-ENOMEM);

        res_counter_init(&mem->res);

        for_each_node_state(node, N_POSSIBLE)
                if (alloc_mem_cgroup_per_zone_info(mem, node))
                        goto free_out;
        /* root ? */
        if (cont->parent == NULL)
                enable_swap_cgroup();

        return &mem->css;
free_out:
        for_each_node_state(node, N_POSSIBLE)
                free_mem_cgroup_per_zone_info(mem, node);
        mem_cgroup_free(mem);
        return ERR_PTR(-ENOMEM);
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
                                        struct cgroup *cont)
{
        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

        mem_cgroup_force_empty(mem, false);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        int node;
        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

        for_each_node_state(node, N_POSSIBLE)
                free_mem_cgroup_per_zone_info(mem, node);

        mem_cgroup_free(mem_cgroup_from_cont(cont));
}
static int mem_cgroup_populate(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        return cgroup_add_files(cont, ss, mem_cgroup_files,
                                        ARRAY_SIZE(mem_cgroup_files));
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                struct cgroup *cont,
                                struct cgroup *old_cont,
                                struct task_struct *p)
{
        struct mm_struct *mm;
        struct mem_cgroup *mem, *old_mem;

        mm = get_task_mm(p);
        if (mm == NULL)
                return;

        mem = mem_cgroup_from_cont(cont);
        old_mem = mem_cgroup_from_cont(old_cont);

        /*
         * Only thread group leaders are allowed to migrate, the mm_struct is
         * in effect owned by the leader
         */
        if (!thread_group_leader(p))
                goto out;

out:
        mmput(mm);
}
struct cgroup_subsys mem_cgroup_subsys = {
        .name = "memory",
        .subsys_id = mem_cgroup_subsys_id,
        .create = mem_cgroup_create,
        .pre_destroy = mem_cgroup_pre_destroy,
        .destroy = mem_cgroup_destroy,
        .populate = mem_cgroup_populate,
        .attach = mem_cgroup_move_task,
        .early_init = 0,
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static int __init disable_swap_account(char *s)
{
        really_do_swap_account = 0;
        return 1;
}
__setup("noswapaccount", disable_swap_account);
#endif
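/*
 * Illustrative note (not from the original file): with
 * CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y, swap accounting defaults to on because
 * really_do_swap_account is initialized to 1; booting with the kernel
 * command line parameter
 *
 *      noswapaccount
 *
 * runs disable_swap_account() via __setup(), so enable_swap_cgroup() then
 * leaves do_swap_account at 0.
 */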