[mirror_ubuntu-focal-kernel.git] / mm / page_cgroup.c

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

/*
 * Put one page_cgroup entry into its initial state: no owning mem_cgroup,
 * flags cleared, and @id recorded through set_page_cgroup_array_id().
 */
static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
{
	pc->mem_cgroup = NULL;

	/* flags are cleared first; the array id is stored afterwards */
	pc->flags = 0;
	set_page_cgroup_array_id(pc, id);
}
/* Running total, in bytes, of the page_cgroup tables allocated so far. */
static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)


/*
 * Non-SPARSEMEM variant: start the node out with no page_cgroup table by
 * clearing pgdat->node_page_cgroup.
 */
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	pgdat->node_page_cgroup = NULL;
}

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long offset;
	struct page_cgroup *base;

	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
	if (unlikely(!base))
		return NULL;

	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
	return base + offset;
}

/*
 * Reverse of lookup_page_cgroup(): find the struct page that @pc describes.
 *
 * The node is taken from the array id stored in @pc; the entry's index in
 * that node's table plus node_start_pfn gives the pfn.
 */
struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
	pg_data_t *pgdat = NODE_DATA(page_cgroup_array_id(pc));
	unsigned long pfn;
	struct page *page;

	pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
	page = pfn_to_page(pfn);

	/* the forward lookup must lead straight back to @pc */
	VM_BUG_ON(pc != lookup_page_cgroup(page));
	return page;
}

/*
 * Allocate and initialise the page_cgroup table for node @nid.
 *
 * One entry is allocated for each of the node's node_spanned_pages, from
 * bootmem, and every entry is tagged with @nid.  A node spanning no pages
 * gets no table.
 *
 * Returns 0 on success (including the empty-node case), -ENOMEM when the
 * bootmem allocation fails.
 */
static int __init alloc_node_page_cgroup(int nid)
{
	struct page_cgroup *base;
	unsigned long table_size;
	unsigned long nr_pages, index;

	nr_pages = NODE_DATA(nid)->node_spanned_pages;
	if (!nr_pages)
		return 0;

	table_size = sizeof(struct page_cgroup) * nr_pages;

	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
	if (!base)
		return -ENOMEM;

	for (index = 0; index < nr_pages; index++)
		init_page_cgroup(base + index, nid);

	/* publish the table only once every entry is initialised */
	NODE_DATA(nid)->node_page_cgroup = base;
	total_usage += table_size;
	return 0;
}

/*
 * Boot-time setup for the non-SPARSEMEM layout: allocate a page_cgroup
 * table for every online node, then report the total size.
 *
 * Does nothing when the memory cgroup is disabled.  An allocation failure
 * is fatal: the function logs a hint and panics.
 */
void __init page_cgroup_init_flatmem(void)
{
	int nid, fail;

	if (mem_cgroup_disabled())
		return;

	for_each_online_node(nid)  {
		fail = alloc_node_page_cgroup(nid);
		if (fail)
			goto nomem;
	}
	/* total_usage is unsigned long, so it is printed with %lu */
	printk(KERN_INFO "allocated %lu bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
	" don't want memory cgroups\n");
	return;
nomem:
	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
	panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

/*
 * Map a struct page to its page_cgroup entry via the page's mem_section.
 *
 * section->page_cgroup is stored pre-biased so that adding the pfn lands
 * on the entry.  Returns NULL when the section has no table.
 */
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);

	if (section->page_cgroup)
		return section->page_cgroup + pfn;

	return NULL;
}

/*
 * Reverse of lookup_page_cgroup(): find the struct page that @pc describes.
 *
 * The array id stored in @pc is used as a section number; subtracting that
 * section's biased page_cgroup pointer from @pc gives the pfn.
 */
struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
	unsigned long nr = page_cgroup_array_id(pc);
	struct mem_section *section = __nr_to_section(nr);
	struct page *page = pfn_to_page(pc - section->page_cgroup);

	/* the forward lookup must lead straight back to @pc */
	VM_BUG_ON(pc != lookup_page_cgroup(page));
	return page;
}

/*
 * Allocate @size bytes for a page_cgroup table, preferring node @nid.
 *
 * The page allocator is tried first (without allocation-failure warnings);
 * a successful allocation is reported to kmemleak.  Otherwise the request
 * falls back to vmalloc, node-local when @nid is in the N_HIGH_MEMORY
 * state.  Returns the table address, or NULL if both attempts fail.
 */
static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
	const gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	void *table = alloc_pages_exact_nid(nid, size, gfp);

	if (!table) {
		/* physically contiguous pages unavailable: use vmalloc */
		return node_state(nid, N_HIGH_MEMORY) ?
			vmalloc_node(size, nid) : vmalloc(size);
	}

	kmemleak_alloc(table, size, 1, gfp);
	return table;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Release one section's page_cgroup table.
 *
 * @addr is the table base as returned by alloc_page_cgroup(): either a
 * vmalloc address or a page-allocator block sized for PAGES_PER_SECTION
 * entries.
 */
static void free_page_cgroup(void *addr)
{
	if (is_vmalloc_addr(addr)) {
		vfree(addr);
	} else {
		struct page *page = virt_to_page(addr);
		size_t table_size =
			sizeof(struct page_cgroup) * PAGES_PER_SECTION;

		BUG_ON(PageReserved(page));
		/*
		 * alloc_page_cgroup() registered this block with
		 * kmemleak_alloc(); drop that record before the pages are
		 * handed back, so kmemleak does not keep a stale object.
		 */
		kmemleak_free(addr);
		free_pages_exact(addr, table_size);
	}
}
#endif

/*
 * Allocate and initialise the page_cgroup table for the section holding @pfn.
 *
 * A section that already has a table is left alone.  Otherwise a table of
 * PAGES_PER_SECTION entries is allocated near node @nid, every entry is
 * tagged with the section number, and the section is pointed at it.
 *
 * Returns 0 on success or if the table already exists, -ENOMEM on
 * allocation failure.
 */
static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
{
	unsigned long nr = pfn_to_section_nr(pfn);
	struct mem_section *section = __nr_to_section(nr);
	unsigned long table_size;
	struct page_cgroup *base, *pc;

	if (section->page_cgroup)
		return 0;

	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
	base = alloc_page_cgroup(table_size, nid);

	/*
	 * section->page_cgroup will hold (base - pfn), which does not point
	 * into the block allocated above; without this hint kmemleak would
	 * report a false positive.
	 */
	kmemleak_not_leak(base);

	if (!base) {
		printk(KERN_ERR "page cgroup allocation failure\n");
		return -ENOMEM;
	}

	for (pc = base; pc < base + PAGES_PER_SECTION; pc++)
		init_page_cgroup(pc, nr);

	/*
	 * @pfn need not be section aligned, so the bias is computed from the
	 * section's first pfn.
	 */
	section->page_cgroup = base - (pfn & PAGE_SECTION_MASK);
	total_usage += table_size;
	return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
void __free_page_cgroup(unsigned long pfn)
{
	struct mem_section *ms;
	struct page_cgroup *base;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_cgroup)
		return;
	base = ms->page_cgroup + pfn;
	free_page_cgroup(base);
	ms->page_cgroup = NULL;
}

/*
 * Set up page_cgroup tables for every present section overlapping
 * [start_pfn, start_pfn + nr_pages).
 *
 * The range is widened to section boundaries.  If any section fails, the
 * tables of all sections in the widened range are freed again.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
int __meminit online_page_cgroup(unsigned long start_pfn,
			unsigned long nr_pages,
			int nid)
{
	const unsigned long first = SECTION_ALIGN_DOWN(start_pfn);
	const unsigned long limit = SECTION_ALIGN_UP(start_pfn + nr_pages);
	unsigned long pfn;

	if (nid == -1) {
		/*
		 * No node was supplied: the node already exists and contains
		 * valid memory.  start_pfn is the pfn handed to
		 * online_pages() and should exist, so take the node from it.
		 */
		nid = pfn_to_nid(start_pfn);
		VM_BUG_ON(!node_state(nid, N_ONLINE));
	}

	for (pfn = first; pfn < limit; pfn += PAGES_PER_SECTION) {
		if (pfn_present(pfn) && init_section_page_cgroup(pfn, nid))
			goto rollback;
	}
	return 0;

rollback:
	for (pfn = first; pfn < limit; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);

	return -ENOMEM;
}

/*
 * Free the page_cgroup tables of every section overlapping
 * [start_pfn, start_pfn + nr_pages), widened to section boundaries.
 * @nid is not used.  Always returns 0.
 */
int __meminit offline_page_cgroup(unsigned long start_pfn,
		unsigned long nr_pages, int nid)
{
	const unsigned long limit = SECTION_ALIGN_UP(start_pfn + nr_pages);
	unsigned long pfn = SECTION_ALIGN_DOWN(start_pfn);

	while (pfn < limit) {
		__free_page_cgroup(pfn);
		pfn += PAGES_PER_SECTION;
	}

	return 0;
}

/*
 * Memory hotplug notifier: create page_cgroup tables when memory is about
 * to come online and free them once memory has gone offline.  Every other
 * action is ignored.  The errno-style result is converted with
 * notifier_from_errno().
 */
static int __meminit page_cgroup_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int err = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		err = online_page_cgroup(mn->start_pfn,
				   mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_OFFLINE:
		offline_page_cgroup(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	default:
		/*
		 * MEM_CANCEL_ONLINE, MEM_GOING_OFFLINE, MEM_ONLINE and
		 * MEM_CANCEL_OFFLINE need no work here.
		 */
		break;
	}

	return notifier_from_errno(err);
}

#endif

/*
 * Boot-time setup for SPARSEMEM: allocate a page_cgroup table for each
 * section belonging to a node in the N_HIGH_MEMORY state, register the
 * memory hotplug notifier, and report the total size.
 *
 * Does nothing when the memory cgroup is disabled.  An allocation failure
 * is fatal: the function logs a hint and panics.
 */
void __init page_cgroup_init(void)
{
	unsigned long pfn;
	int nid;

	if (mem_cgroup_disabled())
		return;

	for_each_node_state(nid, N_HIGH_MEMORY) {
		unsigned long start_pfn, end_pfn;

		start_pfn = node_start_pfn(nid);
		end_pfn = node_end_pfn(nid);
		/*
		 * start_pfn and end_pfn may not be aligned to SECTION and the
		 * page->flags of out of node pages are not initialized.  So we
		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
		 * After the first step, pfn advances to each section start.
		 */
		for (pfn = start_pfn;
		     pfn < end_pfn;
		     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

			if (!pfn_valid(pfn))
				continue;
			/*
			 * Nodes' pfn ranges can overlap.
			 * We know some arch can have a nodes layout such as
			 * -------------pfn-------------->
			 * N0 | N1 | N2 | N0 | N1 | N2|....
			 * so skip sections that belong to another node.
			 */
			if (pfn_to_nid(pfn) != nid)
				continue;
			if (init_section_page_cgroup(pfn, nid))
				goto oom;
		}
	}
	hotplug_memory_notifier(page_cgroup_callback, 0);
	/* total_usage is unsigned long, so it is printed with %lu */
	printk(KERN_INFO "allocated %lu bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
			 "don't want memory cgroups\n");
	return;
oom:
	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
	panic("Out of memory");
}

/*
 * SPARSEMEM variant: this function performs no per-node setup.
 */
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
}

#endif


#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

/*
 * Held by swap_cgroup_swapon() and swap_cgroup_swapoff() while they change
 * a control structure's map and length.
 */
static DEFINE_MUTEX(swap_cgroup_mutex);
/* Bookkeeping for the swap_cgroup table of one swap type. */
struct swap_cgroup_ctrl {
	struct page **map;	/* pages holding the struct swap_cgroup entries */
	unsigned long length;	/* number of pages referenced by map */
	spinlock_t	lock;	/* taken around id updates (cmpxchg/record) */
};

/* One control structure per swap type, indexed by swp_type(). */
static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

/* One table entry: the id recorded for a single swap offset. */
struct swap_cgroup {
	unsigned short		id;
};
/* Number of swap_cgroup entries stored in one page. */
#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
/* Mask that yields an entry's position within its page. */
#define SC_POS_MASK	(SC_PER_PAGE - 1)

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
 * against SwapCache. At swap_free(), this is accessed directly from swap.
 *
 * This means,
 *  - we have no race in "exchange" when we're accessed via SwapCache because
 *    SwapCache(and its swp_entry) is under lock.
 *  - When called via swap_free(), there is no user of this entry and no race.
 * Then, we don't need lock around "exchange".
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */

/*
 * Fill the map of swap type @type with zeroed pages, one per slot up to
 * ctrl->length.  If an allocation fails, the pages obtained so far are
 * freed again.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
static int swap_cgroup_prepare(int type)
{
	struct swap_cgroup_ctrl *ctrl = &swap_cgroup_ctrl[type];
	unsigned long nr_done, i;

	for (nr_done = 0; nr_done < ctrl->length; nr_done++) {
		struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);

		if (!page)
			goto unwind;
		ctrl->map[nr_done] = page;
	}
	return 0;

unwind:
	/* release only the slots that were actually populated */
	for (i = 0; i < nr_done; i++)
		__free_page(ctrl->map[i]);

	return -ENOMEM;
}

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Replaces the recorded id with @new only if it currently equals @old.
 * The compare and the store are done under the per-type spinlock.
 *
 * Returns old id at success, 0 at failure.
 * (There is no mem_cgroup using 0 as its id)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
					unsigned short old, unsigned short new)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	struct swap_cgroup_ctrl *ctrl = &swap_cgroup_ctrl[type];
	struct swap_cgroup *sc;
	unsigned short seen;
	unsigned long flags;

	/* locate the entry: page index first, then position in that page */
	sc = page_address(ctrl->map[offset / SC_PER_PAGE]);
	sc += offset & SC_POS_MASK;

	spin_lock_irqsave(&ctrl->lock, flags);
	seen = sc->id;
	if (seen != old)
		seen = 0;
	else
		sc->id = new;
	spin_unlock_irqrestore(&ctrl->lock, flags);

	return seen;
}

/**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup id to be recorded
 *
 * Stores @id unconditionally, under the per-type spinlock.
 *
 * Returns the id that was recorded before the update.
 * (Of course, old value can be 0.)
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	struct swap_cgroup_ctrl *ctrl = &swap_cgroup_ctrl[type];
	struct swap_cgroup *sc;
	unsigned short previous;
	unsigned long flags;

	/* locate the entry: page index first, then position in that page */
	sc = page_address(ctrl->map[offset / SC_PER_PAGE]);
	sc += offset & SC_POS_MASK;

	spin_lock_irqsave(&ctrl->lock, flags);
	previous = sc->id;
	sc->id = id;
	spin_unlock_irqrestore(&ctrl->lock, flags);

	return previous;
}

/**
 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
 * @ent: swap entry to be looked up.
 *
 * Reads the recorded id without taking the per-type spinlock.
 *
 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
 */
unsigned short lookup_swap_cgroup(swp_entry_t ent)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	struct swap_cgroup_ctrl *ctrl = &swap_cgroup_ctrl[type];
	struct swap_cgroup *sc;

	/* locate the entry: page index first, then position in that page */
	sc = page_address(ctrl->map[offset / SC_PER_PAGE]);
	sc += offset & SC_POS_MASK;

	return sc->id;
}

/*
 * Set up the swap_cgroup table for swap type @type, sized for @max_pages
 * swap entries.
 *
 * A zeroed pointer array is allocated with vzalloc() and, under
 * swap_cgroup_mutex, installed in the control structure and populated by
 * swap_cgroup_prepare().  If populating fails, the control structure is
 * reset and the array is freed.
 *
 * Returns 0 on success or when swap accounting is off, -ENOMEM on
 * allocation failure.
 */
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
	struct swap_cgroup_ctrl *ctrl;
	unsigned long nr_map_pages;
	void *map;
	int err;

	if (!do_swap_account)
		return 0;

	nr_map_pages = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
	map = vzalloc(nr_map_pages * sizeof(void *));
	if (!map)
		goto nomem;

	ctrl = &swap_cgroup_ctrl[type];
	mutex_lock(&swap_cgroup_mutex);
	ctrl->length = nr_map_pages;
	ctrl->map = map;
	spin_lock_init(&ctrl->lock);
	err = swap_cgroup_prepare(type);
	if (err) {
		/* memory shortage: detach the array before it is freed */
		ctrl->map = NULL;
		ctrl->length = 0;
	}
	mutex_unlock(&swap_cgroup_mutex);

	if (!err)
		return 0;

	vfree(map);
nomem:
	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
	printk(KERN_INFO
		"swap_cgroup can be disabled by swapaccount=0 boot option\n");
	return -ENOMEM;
}

/*
 * Tear down the swap_cgroup table for swap type @type.
 *
 * The map is detached from the control structure under swap_cgroup_mutex;
 * its pages and the pointer array are then freed outside the mutex.
 * Does nothing when swap accounting is off.
 */
void swap_cgroup_swapoff(int type)
{
	struct swap_cgroup_ctrl *ctrl;
	struct page **detached;
	unsigned long nr, i;

	if (!do_swap_account)
		return;

	mutex_lock(&swap_cgroup_mutex);
	ctrl = &swap_cgroup_ctrl[type];
	detached = ctrl->map;
	nr = ctrl->length;
	ctrl->map = NULL;
	ctrl->length = 0;
	mutex_unlock(&swap_cgroup_mutex);

	if (!detached)
		return;

	for (i = 0; i < nr; i++) {
		if (detached[i])
			__free_page(detached[i]);
	}
	vfree(detached);
}

#endif
Commit	Line	Data
52d4b9ac KH	1	#include <linux/mm.h>
	2	#include <linux/mmzone.h>
	3	#include <linux/bootmem.h>
	4	#include <linux/bit_spinlock.h>
	5	#include <linux/page_cgroup.h>
	6	#include <linux/hash.h>
94b6da5a	7	#include <linux/slab.h>
52d4b9ac	8	#include <linux/memory.h>
4c821042	9	#include <linux/vmalloc.h>
94b6da5a	10	#include <linux/cgroup.h>
27a7faa0	11	#include <linux/swapops.h>
7952f988	12	#include <linux/kmemleak.h>
52d4b9ac	13
6b3ae58e	14	static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
52d4b9ac KH	15	{
52d4b9ac KH	16	pc->flags = 0;
6b3ae58e	17	set_page_cgroup_array_id(pc, id);
52d4b9ac	18	pc->mem_cgroup = NULL;
52d4b9ac KH	19	}
	20	static unsigned long total_usage;
	21
	22	#if !defined(CONFIG_SPARSEMEM)
	23
	24
31168481	25	void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
52d4b9ac KH	26	{
	27	pgdat->node_page_cgroup = NULL;
	28	}
	29
	30	struct page_cgroup lookup_page_cgroup(struct page page)
	31	{
	32	unsigned long pfn = page_to_pfn(page);
	33	unsigned long offset;
	34	struct page_cgroup *base;
	35
	36	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
	37	if (unlikely(!base))
	38	return NULL;
	39
	40	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
	41	return base + offset;
	42	}
	43
6b3ae58e JW	44	struct page lookup_cgroup_page(struct page_cgroup pc)
	45	{
	46	unsigned long pfn;
	47	struct page *page;
	48	pg_data_t *pgdat;
	49
	50	pgdat = NODE_DATA(page_cgroup_array_id(pc));
	51	pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
	52	page = pfn_to_page(pfn);
	53	VM_BUG_ON(pc != lookup_page_cgroup(page));
	54	return page;
	55	}
	56
52d4b9ac KH	57	static int __init alloc_node_page_cgroup(int nid)
	58	{
	59	struct page_cgroup base, pc;
	60	unsigned long table_size;
	61	unsigned long start_pfn, nr_pages, index;
	62
	63	start_pfn = NODE_DATA(nid)->node_start_pfn;
	64	nr_pages = NODE_DATA(nid)->node_spanned_pages;
	65
653d22c0 KH	66	if (!nr_pages)
	67	return 0;
	68
52d4b9ac	69	table_size = sizeof(struct page_cgroup) * nr_pages;
ca371c0d KH	70
	71	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
	72	table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
	73	if (!base)
52d4b9ac KH	74	return -ENOMEM;
	75	for (index = 0; index < nr_pages; index++) {
	76	pc = base + index;
6b3ae58e	77	init_page_cgroup(pc, nid);
52d4b9ac KH	78	}
	79	NODE_DATA(nid)->node_page_cgroup = base;
	80	total_usage += table_size;
	81	return 0;
	82	}
	83
ca371c0d	84	void __init page_cgroup_init_flatmem(void)
52d4b9ac KH	85	{
	86
	87	int nid, fail;
	88
f8d66542	89	if (mem_cgroup_disabled())
94b6da5a KH	90	return;
94b6da5a KH	91
52d4b9ac KH	92	for_each_online_node(nid) {
	93	fail = alloc_node_page_cgroup(nid);
	94	if (fail)
	95	goto fail;
	96	}
	97	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
8ca739e3 RD	98	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
8ca739e3 RD	99	" don't want memory cgroups\n");
52d4b9ac KH	100	return;
52d4b9ac KH	101	fail:
8ca739e3 RD	102	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
8ca739e3 RD	103	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
52d4b9ac KH	104	panic("Out of memory");
	105	}
	106
	107	#else /* CONFIG_FLAT_NODE_MEM_MAP */
	108
	109	struct page_cgroup lookup_page_cgroup(struct page page)
	110	{
	111	unsigned long pfn = page_to_pfn(page);
	112	struct mem_section *section = __pfn_to_section(pfn);
	113
d69b042f BS	114	if (!section->page_cgroup)
d69b042f BS	115	return NULL;
52d4b9ac KH	116	return section->page_cgroup + pfn;
	117	}
	118
6b3ae58e JW	119	struct page lookup_cgroup_page(struct page_cgroup pc)
	120	{
	121	struct mem_section *section;
	122	struct page *page;
	123	unsigned long nr;
	124
	125	nr = page_cgroup_array_id(pc);
	126	section = __nr_to_section(nr);
	127	page = pfn_to_page(pc - section->page_cgroup);
	128	VM_BUG_ON(pc != lookup_page_cgroup(page));
	129	return page;
	130	}
	131
268433b8	132	static void *__meminit alloc_page_cgroup(size_t size, int nid)
dde79e00 MH	133	{
dde79e00 MH	134	void *addr = NULL;
ff7ee93f	135	gfp_t flags = GFP_KERNEL \| __GFP_NOWARN;
dde79e00	136
ff7ee93f SR	137	addr = alloc_pages_exact_nid(nid, size, flags);
	138	if (addr) {
	139	kmemleak_alloc(addr, size, 1, flags);
dde79e00	140	return addr;
ff7ee93f	141	}
dde79e00 MH	142
	143	if (node_state(nid, N_HIGH_MEMORY))
	144	addr = vmalloc_node(size, nid);
	145	else
	146	addr = vmalloc(size);
	147
	148	return addr;
	149	}
	150
	151	#ifdef CONFIG_MEMORY_HOTPLUG
	152	static void free_page_cgroup(void *addr)
	153	{
	154	if (is_vmalloc_addr(addr)) {
	155	vfree(addr);
	156	} else {
	157	struct page *page = virt_to_page(addr);
6cfddb26 MH	158	size_t table_size =
	159	sizeof(struct page_cgroup) * PAGES_PER_SECTION;
	160
	161	BUG_ON(PageReserved(page));
	162	free_pages_exact(addr, table_size);
dde79e00 MH	163	}
	164	}
	165	#endif
	166
37573e8c	167	static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
52d4b9ac	168	{
52d4b9ac	169	struct page_cgroup base, pc;
6b3ae58e	170	struct mem_section *section;
52d4b9ac	171	unsigned long table_size;
6b3ae58e	172	unsigned long nr;
37573e8c	173	int index;
52d4b9ac	174
6b3ae58e JW	175	nr = pfn_to_section_nr(pfn);
	176	section = __nr_to_section(nr);
	177
	178	if (section->page_cgroup)
	179	return 0;
	180
6b3ae58e	181	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
dde79e00 MH	182	base = alloc_page_cgroup(table_size, nid);
dde79e00 MH	183
6b3ae58e JW	184	/*
	185	* The value stored in section->page_cgroup is (base - pfn)
	186	* and it does not point to the memory block allocated above,
	187	* causing kmemleak false positives.
	188	*/
	189	kmemleak_not_leak(base);
52d4b9ac KH	190
	191	if (!base) {
	192	printk(KERN_ERR "page cgroup allocation failure\n");
	193	return -ENOMEM;
	194	}
	195
	196	for (index = 0; index < PAGES_PER_SECTION; index++) {
	197	pc = base + index;
6b3ae58e	198	init_page_cgroup(pc, nr);
52d4b9ac	199	}
37573e8c KH	200	/*
	201	* The passed "pfn" may not be aligned to SECTION. For the calculation
	202	* we need to apply a mask.
	203	*/
	204	pfn &= PAGE_SECTION_MASK;
52d4b9ac KH	205	section->page_cgroup = base - pfn;
	206	total_usage += table_size;
	207	return 0;
	208	}
	209	#ifdef CONFIG_MEMORY_HOTPLUG
	210	void __free_page_cgroup(unsigned long pfn)
	211	{
	212	struct mem_section *ms;
	213	struct page_cgroup *base;
	214
	215	ms = __pfn_to_section(pfn);
	216	if (!ms \|\| !ms->page_cgroup)
	217	return;
	218	base = ms->page_cgroup + pfn;
dde79e00 MH	219	free_page_cgroup(base);
dde79e00 MH	220	ms->page_cgroup = NULL;
52d4b9ac KH	221	}
52d4b9ac KH	222
31168481	223	int __meminit online_page_cgroup(unsigned long start_pfn,
52d4b9ac KH	224	unsigned long nr_pages,
	225	int nid)
	226	{
	227	unsigned long start, end, pfn;
	228	int fail = 0;
	229
1bb36fbd DK	230	start = SECTION_ALIGN_DOWN(start_pfn);
1bb36fbd DK	231	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
52d4b9ac	232
37573e8c KH	233	if (nid == -1) {
	234	/*
	235	* In this case, "nid" already exists and contains valid memory.
	236	* "start_pfn" passed to us is a pfn which is an arg for
	237	* online__pages(), and start_pfn should exist.
	238	*/
	239	nid = pfn_to_nid(start_pfn);
	240	VM_BUG_ON(!node_state(nid, N_ONLINE));
	241	}
	242
52d4b9ac KH	243	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
	244	if (!pfn_present(pfn))
	245	continue;
37573e8c	246	fail = init_section_page_cgroup(pfn, nid);
52d4b9ac KH	247	}
	248	if (!fail)
	249	return 0;
	250
	251	/* rollback */
	252	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
	253	__free_page_cgroup(pfn);
	254
	255	return -ENOMEM;
	256	}
	257
31168481	258	int __meminit offline_page_cgroup(unsigned long start_pfn,
52d4b9ac KH	259	unsigned long nr_pages, int nid)
	260	{
	261	unsigned long start, end, pfn;
	262
1bb36fbd DK	263	start = SECTION_ALIGN_DOWN(start_pfn);
1bb36fbd DK	264	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
52d4b9ac KH	265
	266	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
	267	__free_page_cgroup(pfn);
	268	return 0;
	269
	270	}
	271
31168481	272	static int __meminit page_cgroup_callback(struct notifier_block *self,
52d4b9ac KH	273	unsigned long action, void *arg)
	274	{
	275	struct memory_notify *mn = arg;
	276	int ret = 0;
	277	switch (action) {
	278	case MEM_GOING_ONLINE:
	279	ret = online_page_cgroup(mn->start_pfn,
	280	mn->nr_pages, mn->status_change_nid);
	281	break;
52d4b9ac KH	282	case MEM_OFFLINE:
	283	offline_page_cgroup(mn->start_pfn,
	284	mn->nr_pages, mn->status_change_nid);
	285	break;
dc19f9db	286	case MEM_CANCEL_ONLINE:
52d4b9ac KH	287	case MEM_GOING_OFFLINE:
	288	break;
	289	case MEM_ONLINE:
	290	case MEM_CANCEL_OFFLINE:
	291	break;
	292	}
dc19f9db	293
5fda1bd5	294	return notifier_from_errno(ret);
52d4b9ac KH	295	}
	296
	297	#endif
	298
	299	void __init page_cgroup_init(void)
	300	{
	301	unsigned long pfn;
37573e8c	302	int nid;
52d4b9ac	303
f8d66542	304	if (mem_cgroup_disabled())
94b6da5a KH	305	return;
94b6da5a KH	306
37573e8c KH	307	for_each_node_state(nid, N_HIGH_MEMORY) {
	308	unsigned long start_pfn, end_pfn;
	309
	310	start_pfn = node_start_pfn(nid);
	311	end_pfn = node_end_pfn(nid);
	312	/*
	313	* start_pfn and end_pfn may not be aligned to SECTION and the
	314	* page->flags of out of node pages are not initialized. So we
	315	* scan [start_pfn, the biggest section's pfn < end_pfn) here.
	316	*/
	317	for (pfn = start_pfn;
	318	pfn < end_pfn;
	319	pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
	320
	321	if (!pfn_valid(pfn))
	322	continue;
	323	/*
	324	* Nodes's pfns can be overlapping.
	325	* We know some arch can have a nodes layout such as
	326	* -------------pfn-------------->
	327	* N0 \| N1 \| N2 \| N0 \| N1 \| N2\|....
	328	*/
	329	if (pfn_to_nid(pfn) != nid)
	330	continue;
	331	if (init_section_page_cgroup(pfn, nid))
	332	goto oom;
	333	}
52d4b9ac	334	}
37573e8c	335	hotplug_memory_notifier(page_cgroup_callback, 0);
52d4b9ac	336	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
37573e8c KH	337	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
	338	"don't want memory cgroups\n");
	339	return;
	340	oom:
	341	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
	342	panic("Out of memory");
52d4b9ac KH	343	}
52d4b9ac KH	344
31168481	345	void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
52d4b9ac KH	346	{
	347	return;
	348	}
	349
	350	#endif
27a7faa0 KH	351
	352
	353	#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
	354
	355	static DEFINE_MUTEX(swap_cgroup_mutex);
	356	struct swap_cgroup_ctrl {
	357	struct page **map;
	358	unsigned long length;
e9e58a4e	359	spinlock_t lock;
27a7faa0 KH	360	};
27a7faa0 KH	361
61600f57	362	static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
27a7faa0	363
27a7faa0	364	struct swap_cgroup {
a3b2d692	365	unsigned short id;
27a7faa0 KH	366	};
	367	#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
	368	#define SC_POS_MASK (SC_PER_PAGE - 1)
	369
	370	/*
	371	* SwapCgroup implements "lookup" and "exchange" operations.
	372	* In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
	373	* against SwapCache. At swap_free(), this is accessed directly from swap.
	374	*
	375	* This means,
	376	* - we have no race in "exchange" when we're accessed via SwapCache because
	377	* SwapCache(and its swp_entry) is under lock.
	378	* - When called via swap_free(), there is no user of this entry and no race.
	379	* Then, we don't need lock around "exchange".
	380	*
	381	* TODO: we can push these buffers out to HIGHMEM.
	382	*/
	383
	384	/*
	385	* allocate buffer for swap_cgroup.
	386	*/
	387	static int swap_cgroup_prepare(int type)
	388	{
	389	struct page *page;
	390	struct swap_cgroup_ctrl *ctrl;
	391	unsigned long idx, max;
	392
27a7faa0 KH	393	ctrl = &swap_cgroup_ctrl[type];
	394
	395	for (idx = 0; idx < ctrl->length; idx++) {
	396	page = alloc_page(GFP_KERNEL \| __GFP_ZERO);
	397	if (!page)
	398	goto not_enough_page;
	399	ctrl->map[idx] = page;
	400	}
	401	return 0;
	402	not_enough_page:
	403	max = idx;
	404	for (idx = 0; idx < max; idx++)
	405	__free_page(ctrl->map[idx]);
	406
	407	return -ENOMEM;
	408	}
	409
02491447 DN	410	/**
	411	* swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
	412	* @end: swap entry to be cmpxchged
	413	* @old: old id
	414	* @new: new id
	415	*
	416	* Returns old id at success, 0 at failure.
25985edc	417	* (There is no mem_cgroup using 0 as its id)
02491447 DN	418	*/
	419	unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
	420	unsigned short old, unsigned short new)
	421	{
	422	int type = swp_type(ent);
	423	unsigned long offset = swp_offset(ent);
	424	unsigned long idx = offset / SC_PER_PAGE;
	425	unsigned long pos = offset & SC_POS_MASK;
	426	struct swap_cgroup_ctrl *ctrl;
	427	struct page *mappage;
	428	struct swap_cgroup *sc;
e9e58a4e KH	429	unsigned long flags;
e9e58a4e KH	430	unsigned short retval;
02491447 DN	431
	432	ctrl = &swap_cgroup_ctrl[type];
	433
	434	mappage = ctrl->map[idx];
	435	sc = page_address(mappage);
	436	sc += pos;
e9e58a4e KH	437	spin_lock_irqsave(&ctrl->lock, flags);
	438	retval = sc->id;
	439	if (retval == old)
	440	sc->id = new;
02491447	441	else
e9e58a4e KH	442	retval = 0;
	443	spin_unlock_irqrestore(&ctrl->lock, flags);
	444	return retval;
02491447 DN	445	}
02491447 DN	446
27a7faa0 KH	447	/**
	448	* swap_cgroup_record - record mem_cgroup for this swp_entry.
	449	* @ent: swap entry to be recorded into
	450	* @mem: mem_cgroup to be recorded
	451	*
a3b2d692 KH	452	* Returns old value at success, 0 at failure.
a3b2d692 KH	453	* (Of course, old value can be 0.)
27a7faa0	454	*/
a3b2d692	455	unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
27a7faa0 KH	456	{
	457	int type = swp_type(ent);
	458	unsigned long offset = swp_offset(ent);
	459	unsigned long idx = offset / SC_PER_PAGE;
	460	unsigned long pos = offset & SC_POS_MASK;
	461	struct swap_cgroup_ctrl *ctrl;
	462	struct page *mappage;
	463	struct swap_cgroup *sc;
a3b2d692	464	unsigned short old;
e9e58a4e	465	unsigned long flags;
27a7faa0	466
27a7faa0 KH	467	ctrl = &swap_cgroup_ctrl[type];
	468
	469	mappage = ctrl->map[idx];
	470	sc = page_address(mappage);
	471	sc += pos;
e9e58a4e KH	472	spin_lock_irqsave(&ctrl->lock, flags);
	473	old = sc->id;
	474	sc->id = id;
	475	spin_unlock_irqrestore(&ctrl->lock, flags);
27a7faa0 KH	476
	477	return old;
	478	}
	479
	480	/**
	481	* lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
	482	* @ent: swap entry to be looked up.
	483	*
a3b2d692	484	* Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
27a7faa0	485	*/
a3b2d692	486	unsigned short lookup_swap_cgroup(swp_entry_t ent)
27a7faa0 KH	487	{
	488	int type = swp_type(ent);
	489	unsigned long offset = swp_offset(ent);
	490	unsigned long idx = offset / SC_PER_PAGE;
	491	unsigned long pos = offset & SC_POS_MASK;
	492	struct swap_cgroup_ctrl *ctrl;
	493	struct page *mappage;
	494	struct swap_cgroup *sc;
a3b2d692	495	unsigned short ret;
27a7faa0	496
27a7faa0 KH	497	ctrl = &swap_cgroup_ctrl[type];
	498	mappage = ctrl->map[idx];
	499	sc = page_address(mappage);
	500	sc += pos;
a3b2d692	501	ret = sc->id;
27a7faa0 KH	502	return ret;
	503	}
	504
	505	int swap_cgroup_swapon(int type, unsigned long max_pages)
	506	{
	507	void *array;
	508	unsigned long array_size;
	509	unsigned long length;
	510	struct swap_cgroup_ctrl *ctrl;
	511
	512	if (!do_swap_account)
	513	return 0;
	514
33278f7f	515	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
27a7faa0 KH	516	array_size = length * sizeof(void *);
27a7faa0 KH	517
8c1fec1b	518	array = vzalloc(array_size);
27a7faa0 KH	519	if (!array)
	520	goto nomem;
	521
27a7faa0 KH	522	ctrl = &swap_cgroup_ctrl[type];
	523	mutex_lock(&swap_cgroup_mutex);
	524	ctrl->length = length;
	525	ctrl->map = array;
e9e58a4e	526	spin_lock_init(&ctrl->lock);
27a7faa0 KH	527	if (swap_cgroup_prepare(type)) {
	528	/* memory shortage */
	529	ctrl->map = NULL;
	530	ctrl->length = 0;
27a7faa0	531	mutex_unlock(&swap_cgroup_mutex);
6a5b18d2	532	vfree(array);
27a7faa0 KH	533	goto nomem;
	534	}
	535	mutex_unlock(&swap_cgroup_mutex);
	536
27a7faa0 KH	537	return 0;
	538	nomem:
	539	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
	540	printk(KERN_INFO
00a66d29	541	"swap_cgroup can be disabled by swapaccount=0 boot option\n");
27a7faa0 KH	542	return -ENOMEM;
	543	}
	544
	545	void swap_cgroup_swapoff(int type)
	546	{
6a5b18d2 NK	547	struct page **map;
6a5b18d2 NK	548	unsigned long i, length;
27a7faa0 KH	549	struct swap_cgroup_ctrl *ctrl;
	550
	551	if (!do_swap_account)
	552	return;
	553
	554	mutex_lock(&swap_cgroup_mutex);
	555	ctrl = &swap_cgroup_ctrl[type];
6a5b18d2 NK	556	map = ctrl->map;
	557	length = ctrl->length;
	558	ctrl->map = NULL;
	559	ctrl->length = 0;
	560	mutex_unlock(&swap_cgroup_mutex);
	561
	562	if (map) {
	563	for (i = 0; i < length; i++) {
	564	struct page *page = map[i];
27a7faa0 KH	565	if (page)
	566	__free_page(page);
	567	}
6a5b18d2	568	vfree(map);
27a7faa0	569	}
27a7faa0 KH	570	}
	571
	572	#endif