mm/page_cgroup.c

   1 #include <linux/mm.h>
   2 #include <linux/mmzone.h>
   3 #include <linux/bootmem.h>
   4 #include <linux/bit_spinlock.h>
   5 #include <linux/page_cgroup.h>
   6 #include <linux/hash.h>
   7 #include <linux/slab.h>
   8 #include <linux/memory.h>
   9 #include <linux/vmalloc.h>
  10 #include <linux/cgroup.h>
  11 #include <linux/swapops.h>
  12 #include <linux/kmemleak.h>
  13
  14 static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
  15 {
  16         pc->flags = 0;
  17         set_page_cgroup_array_id(pc, id);
  18         pc->mem_cgroup = NULL;
  19         INIT_LIST_HEAD(&pc->lru);
  20 }
/*
 * Running total, in bytes, of the page_cgroup tables allocated by this
 * file; reported through printk() once initialisation has finished.
 */
static unsigned long total_usage;
  22
  23 #if !defined(CONFIG_SPARSEMEM)
  24
  25
  26 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
  27 {
  28         pgdat->node_page_cgroup = NULL;
  29 }
  30
  31 struct page_cgroup *lookup_page_cgroup(struct page *page)
  32 {
  33         unsigned long pfn = page_to_pfn(page);
  34         unsigned long offset;
  35         struct page_cgroup *base;
  36
  37         base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
  38         if (unlikely(!base))
  39                 return NULL;
  40
  41         offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
  42         return base + offset;
  43 }
  44
  45 struct page *lookup_cgroup_page(struct page_cgroup *pc)
  46 {
  47         unsigned long pfn;
  48         struct page *page;
  49         pg_data_t *pgdat;
  50
  51         pgdat = NODE_DATA(page_cgroup_array_id(pc));
  52         pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
  53         page = pfn_to_page(pfn);
  54         VM_BUG_ON(pc != lookup_page_cgroup(page));
  55         return page;
  56 }
  57
  58 static int __init alloc_node_page_cgroup(int nid)
  59 {
  60         struct page_cgroup *base, *pc;
  61         unsigned long table_size;
  62         unsigned long start_pfn, nr_pages, index;
  63
  64         start_pfn = NODE_DATA(nid)->node_start_pfn;
  65         nr_pages = NODE_DATA(nid)->node_spanned_pages;
  66
  67         if (!nr_pages)
  68                 return 0;
  69
  70         table_size = sizeof(struct page_cgroup) * nr_pages;
  71
  72         base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
  73                         table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
  74         if (!base)
  75                 return -ENOMEM;
  76         for (index = 0; index < nr_pages; index++) {
  77                 pc = base + index;
  78                 init_page_cgroup(pc, nid);
  79         }
  80         NODE_DATA(nid)->node_page_cgroup = base;
  81         total_usage += table_size;
  82         return 0;
  83 }
  84
  85 void __init page_cgroup_init_flatmem(void)
  86 {
  87
  88         int nid, fail;
  89
  90         if (mem_cgroup_disabled())
  91                 return;
  92
  93         for_each_online_node(nid)  {
  94                 fail = alloc_node_page_cgroup(nid);
  95                 if (fail)
  96                         goto fail;
  97         }
  98         printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
  99         printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
 100         " don't want memory cgroups\n");
 101         return;
 102 fail:
 103         printk(KERN_CRIT "allocation of page_cgroup failed.\n");
 104         printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
 105         panic("Out of memory");
 106 }
 107
#else /* CONFIG_SPARSEMEM */
 109
 110 struct page_cgroup *lookup_page_cgroup(struct page *page)
 111 {
 112         unsigned long pfn = page_to_pfn(page);
 113         struct mem_section *section = __pfn_to_section(pfn);
 114
 115         if (!section->page_cgroup)
 116                 return NULL;
 117         return section->page_cgroup + pfn;
 118 }
 119
 120 struct page *lookup_cgroup_page(struct page_cgroup *pc)
 121 {
 122         struct mem_section *section;
 123         struct page *page;
 124         unsigned long nr;
 125
 126         nr = page_cgroup_array_id(pc);
 127         section = __nr_to_section(nr);
 128         page = pfn_to_page(pc - section->page_cgroup);
 129         VM_BUG_ON(pc != lookup_page_cgroup(page));
 130         return page;
 131 }
 132
 133 static void *__meminit alloc_page_cgroup(size_t size, int nid)
 134 {
 135         void *addr = NULL;
 136
 137         addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
 138         if (addr)
 139                 return addr;
 140
 141         if (node_state(nid, N_HIGH_MEMORY))
 142                 addr = vmalloc_node(size, nid);
 143         else
 144                 addr = vmalloc(size);
 145
 146         return addr;
 147 }
 148
 149 #ifdef CONFIG_MEMORY_HOTPLUG
 150 static void free_page_cgroup(void *addr)
 151 {
 152         if (is_vmalloc_addr(addr)) {
 153                 vfree(addr);
 154         } else {
 155                 struct page *page = virt_to_page(addr);
 156                 size_t table_size =
 157                         sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 158
 159                 BUG_ON(PageReserved(page));
 160                 free_pages_exact(addr, table_size);
 161         }
 162 }
 163 #endif
 164
 165 static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
 166 {
 167         struct page_cgroup *base, *pc;
 168         struct mem_section *section;
 169         unsigned long table_size;
 170         unsigned long nr;
 171         int index;
 172
 173         nr = pfn_to_section_nr(pfn);
 174         section = __nr_to_section(nr);
 175
 176         if (section->page_cgroup)
 177                 return 0;
 178
 179         table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 180         base = alloc_page_cgroup(table_size, nid);
 181
 182         /*
 183          * The value stored in section->page_cgroup is (base - pfn)
 184          * and it does not point to the memory block allocated above,
 185          * causing kmemleak false positives.
 186          */
 187         kmemleak_not_leak(base);
 188
 189         if (!base) {
 190                 printk(KERN_ERR "page cgroup allocation failure\n");
 191                 return -ENOMEM;
 192         }
 193
 194         for (index = 0; index < PAGES_PER_SECTION; index++) {
 195                 pc = base + index;
 196                 init_page_cgroup(pc, nr);
 197         }
 198         /*
 199          * The passed "pfn" may not be aligned to SECTION.  For the calculation
 200          * we need to apply a mask.
 201          */
 202         pfn &= PAGE_SECTION_MASK;
 203         section->page_cgroup = base - pfn;
 204         total_usage += table_size;
 205         return 0;
 206 }
 207 #ifdef CONFIG_MEMORY_HOTPLUG
 208 void __free_page_cgroup(unsigned long pfn)
 209 {
 210         struct mem_section *ms;
 211         struct page_cgroup *base;
 212
 213         ms = __pfn_to_section(pfn);
 214         if (!ms || !ms->page_cgroup)
 215                 return;
 216         base = ms->page_cgroup + pfn;
 217         free_page_cgroup(base);
 218         ms->page_cgroup = NULL;
 219 }
 220
 221 int __meminit online_page_cgroup(unsigned long start_pfn,
 222                         unsigned long nr_pages,
 223                         int nid)
 224 {
 225         unsigned long start, end, pfn;
 226         int fail = 0;
 227
 228         start = SECTION_ALIGN_DOWN(start_pfn);
 229         end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 230
 231         if (nid == -1) {
 232                 /*
 233                  * In this case, "nid" already exists and contains valid memory.
 234                  * "start_pfn" passed to us is a pfn which is an arg for
 235                  * online__pages(), and start_pfn should exist.
 236                  */
 237                 nid = pfn_to_nid(start_pfn);
 238                 VM_BUG_ON(!node_state(nid, N_ONLINE));
 239         }
 240
 241         for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
 242                 if (!pfn_present(pfn))
 243                         continue;
 244                 fail = init_section_page_cgroup(pfn, nid);
 245         }
 246         if (!fail)
 247                 return 0;
 248
 249         /* rollback */
 250         for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 251                 __free_page_cgroup(pfn);
 252
 253         return -ENOMEM;
 254 }
 255
 256 int __meminit offline_page_cgroup(unsigned long start_pfn,
 257                 unsigned long nr_pages, int nid)
 258 {
 259         unsigned long start, end, pfn;
 260
 261         start = SECTION_ALIGN_DOWN(start_pfn);
 262         end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 263
 264         for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 265                 __free_page_cgroup(pfn);
 266         return 0;
 267
 268 }
 269
 270 static int __meminit page_cgroup_callback(struct notifier_block *self,
 271                                unsigned long action, void *arg)
 272 {
 273         struct memory_notify *mn = arg;
 274         int ret = 0;
 275         switch (action) {
 276         case MEM_GOING_ONLINE:
 277                 ret = online_page_cgroup(mn->start_pfn,
 278                                    mn->nr_pages, mn->status_change_nid);
 279                 break;
 280         case MEM_OFFLINE:
 281                 offline_page_cgroup(mn->start_pfn,
 282                                 mn->nr_pages, mn->status_change_nid);
 283                 break;
 284         case MEM_CANCEL_ONLINE:
 285         case MEM_GOING_OFFLINE:
 286                 break;
 287         case MEM_ONLINE:
 288         case MEM_CANCEL_OFFLINE:
 289                 break;
 290         }
 291
 292         return notifier_from_errno(ret);
 293 }
 294
 295 #endif
 296
 297 void __init page_cgroup_init(void)
 298 {
 299         unsigned long pfn;
 300         int nid;
 301
 302         if (mem_cgroup_disabled())
 303                 return;
 304
 305         for_each_node_state(nid, N_HIGH_MEMORY) {
 306                 unsigned long start_pfn, end_pfn;
 307
 308                 start_pfn = node_start_pfn(nid);
 309                 end_pfn = node_end_pfn(nid);
 310                 /*
 311                  * start_pfn and end_pfn may not be aligned to SECTION and the
 312                  * page->flags of out of node pages are not initialized.  So we
 313                  * scan [start_pfn, the biggest section's pfn < end_pfn) here.
 314                  */
 315                 for (pfn = start_pfn;
 316                      pfn < end_pfn;
 317                      pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
 318
 319                         if (!pfn_valid(pfn))
 320                                 continue;
 321                         /*
 322                          * Nodes's pfns can be overlapping.
 323                          * We know some arch can have a nodes layout such as
 324                          * -------------pfn-------------->
 325                          * N0 | N1 | N2 | N0 | N1 | N2|....
 326                          */
 327                         if (pfn_to_nid(pfn) != nid)
 328                                 continue;
 329                         if (init_section_page_cgroup(pfn, nid))
 330                                 goto oom;
 331                 }
 332         }
 333         hotplug_memory_notifier(page_cgroup_callback, 0);
 334         printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
 335         printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
 336                          "don't want memory cgroups\n");
 337         return;
 338 oom:
 339         printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
 340         panic("Out of memory");
 341 }
 342
 343 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 344 {
 345         return;
 346 }
 347
 348 #endif
 349
 350
 351 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 352
/*
 * Held by swap_cgroup_swapon() and swap_cgroup_swapoff() while they
 * install or tear down the map and length of a swap_cgroup_ctrl entry.
 */
static DEFINE_MUTEX(swap_cgroup_mutex);
/* Bookkeeping for the swap_cgroup records of one swap type. */
struct swap_cgroup_ctrl {
        /* vzalloc()ed array of pages that hold the records */
        struct page **map;
        /* number of entries in map */
        unsigned long length;
        /* taken around updates of a record's id */
        spinlock_t      lock;
};

/* One control entry per swap type; indexed by swp_type() of an entry. */
struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
 361
/* Per swap-offset record: the id stored for that swap entry. */
struct swap_cgroup {
        unsigned short          id;
};
/* Number of swap_cgroup records that fit into one page. */
#define SC_PER_PAGE     (PAGE_SIZE/sizeof(struct swap_cgroup))
/* Mask that extracts a record's position inside its page from an offset. */
#define SC_POS_MASK     (SC_PER_PAGE - 1)
 367
 368 /*
 369  * SwapCgroup implements "lookup" and "exchange" operations.
 370  * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
 371  * against SwapCache. At swap_free(), this is accessed directly from swap.
 372  *
 373  * This means,
 374  *  - we have no race in "exchange" when we're accessed via SwapCache because
 375  *    SwapCache(and its swp_entry) is under lock.
 376  *  - When called via swap_free(), there is no user of this entry and no race.
 377  * Then, we don't need lock around "exchange".
 378  *
 379  * TODO: we can push these buffers out to HIGHMEM.
 380  */
 381
 382 /*
 383  * allocate buffer for swap_cgroup.
 384  */
 385 static int swap_cgroup_prepare(int type)
 386 {
 387         struct page *page;
 388         struct swap_cgroup_ctrl *ctrl;
 389         unsigned long idx, max;
 390
 391         ctrl = &swap_cgroup_ctrl[type];
 392
 393         for (idx = 0; idx < ctrl->length; idx++) {
 394                 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 395                 if (!page)
 396                         goto not_enough_page;
 397                 ctrl->map[idx] = page;
 398         }
 399         return 0;
 400 not_enough_page:
 401         max = idx;
 402         for (idx = 0; idx < max; idx++)
 403                 __free_page(ctrl->map[idx]);
 404
 405         return -ENOMEM;
 406 }
 407
 408 /**
 409  * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 410  * @end: swap entry to be cmpxchged
 411  * @old: old id
 412  * @new: new id
 413  *
 414  * Returns old id at success, 0 at failure.
 415  * (There is no mem_cgroup using 0 as its id)
 416  */
 417 unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
 418                                         unsigned short old, unsigned short new)
 419 {
 420         int type = swp_type(ent);
 421         unsigned long offset = swp_offset(ent);
 422         unsigned long idx = offset / SC_PER_PAGE;
 423         unsigned long pos = offset & SC_POS_MASK;
 424         struct swap_cgroup_ctrl *ctrl;
 425         struct page *mappage;
 426         struct swap_cgroup *sc;
 427         unsigned long flags;
 428         unsigned short retval;
 429
 430         ctrl = &swap_cgroup_ctrl[type];
 431
 432         mappage = ctrl->map[idx];
 433         sc = page_address(mappage);
 434         sc += pos;
 435         spin_lock_irqsave(&ctrl->lock, flags);
 436         retval = sc->id;
 437         if (retval == old)
 438                 sc->id = new;
 439         else
 440                 retval = 0;
 441         spin_unlock_irqrestore(&ctrl->lock, flags);
 442         return retval;
 443 }
 444
 445 /**
 446  * swap_cgroup_record - record mem_cgroup for this swp_entry.
 447  * @ent: swap entry to be recorded into
 448  * @mem: mem_cgroup to be recorded
 449  *
 450  * Returns old value at success, 0 at failure.
 451  * (Of course, old value can be 0.)
 452  */
 453 unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
 454 {
 455         int type = swp_type(ent);
 456         unsigned long offset = swp_offset(ent);
 457         unsigned long idx = offset / SC_PER_PAGE;
 458         unsigned long pos = offset & SC_POS_MASK;
 459         struct swap_cgroup_ctrl *ctrl;
 460         struct page *mappage;
 461         struct swap_cgroup *sc;
 462         unsigned short old;
 463         unsigned long flags;
 464
 465         ctrl = &swap_cgroup_ctrl[type];
 466
 467         mappage = ctrl->map[idx];
 468         sc = page_address(mappage);
 469         sc += pos;
 470         spin_lock_irqsave(&ctrl->lock, flags);
 471         old = sc->id;
 472         sc->id = id;
 473         spin_unlock_irqrestore(&ctrl->lock, flags);
 474
 475         return old;
 476 }
 477
 478 /**
 479  * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
 480  * @ent: swap entry to be looked up.
 481  *
 482  * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
 483  */
 484 unsigned short lookup_swap_cgroup(swp_entry_t ent)
 485 {
 486         int type = swp_type(ent);
 487         unsigned long offset = swp_offset(ent);
 488         unsigned long idx = offset / SC_PER_PAGE;
 489         unsigned long pos = offset & SC_POS_MASK;
 490         struct swap_cgroup_ctrl *ctrl;
 491         struct page *mappage;
 492         struct swap_cgroup *sc;
 493         unsigned short ret;
 494
 495         ctrl = &swap_cgroup_ctrl[type];
 496         mappage = ctrl->map[idx];
 497         sc = page_address(mappage);
 498         sc += pos;
 499         ret = sc->id;
 500         return ret;
 501 }
 502
 503 int swap_cgroup_swapon(int type, unsigned long max_pages)
 504 {
 505         void *array;
 506         unsigned long array_size;
 507         unsigned long length;
 508         struct swap_cgroup_ctrl *ctrl;
 509
 510         if (!do_swap_account)
 511                 return 0;
 512
 513         length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
 514         array_size = length * sizeof(void *);
 515
 516         array = vzalloc(array_size);
 517         if (!array)
 518                 goto nomem;
 519
 520         ctrl = &swap_cgroup_ctrl[type];
 521         mutex_lock(&swap_cgroup_mutex);
 522         ctrl->length = length;
 523         ctrl->map = array;
 524         spin_lock_init(&ctrl->lock);
 525         if (swap_cgroup_prepare(type)) {
 526                 /* memory shortage */
 527                 ctrl->map = NULL;
 528                 ctrl->length = 0;
 529                 mutex_unlock(&swap_cgroup_mutex);
 530                 vfree(array);
 531                 goto nomem;
 532         }
 533         mutex_unlock(&swap_cgroup_mutex);
 534
 535         return 0;
 536 nomem:
 537         printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
 538         printk(KERN_INFO
 539                 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
 540         return -ENOMEM;
 541 }
 542
 543 void swap_cgroup_swapoff(int type)
 544 {
 545         struct page **map;
 546         unsigned long i, length;
 547         struct swap_cgroup_ctrl *ctrl;
 548
 549         if (!do_swap_account)
 550                 return;
 551
 552         mutex_lock(&swap_cgroup_mutex);
 553         ctrl = &swap_cgroup_ctrl[type];
 554         map = ctrl->map;
 555         length = ctrl->length;
 556         ctrl->map = NULL;
 557         ctrl->length = 0;
 558         mutex_unlock(&swap_cgroup_mutex);
 559
 560         if (map) {
 561                 for (i = 0; i < length; i++) {
 562                         struct page *page = map[i];
 563                         if (page)
 564                                 __free_page(page);
 565                 }
 566                 vfree(map);
 567         }
 568 }
 569
 570 #endif