/*
 * linux/mm/swap_state.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 *
 * Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/page_cgroup.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.set_page_dirty	= __set_page_dirty_nobuffers,
	.migratepage	= migrate_page,
};

static struct backing_dev_info swap_backing_dev_info = {
	.name		= "swap",
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};

struct address_space swapper_space = {
	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
	.tree_lock	= __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
	.a_ops		= &swap_aops,
	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
	.backing_dev_info = &swap_backing_dev_info,
};

#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
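
/*
 * Counters behind the statistics printed by show_swap_cache_info();
 * they are bumped via INC_CACHE_INFO().
 */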
static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages);
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int error;

	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(PageSwapCache(page));
	VM_BUG_ON(!PageSwapBacked(page));

	page_cache_get(page);
	SetPageSwapCache(page);
	set_page_private(page, entry.val);

	spin_lock_irq(&swapper_space.tree_lock);
	error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
	if (likely(!error)) {
		total_swapcache_pages++;
		__inc_zone_page_state(page, NR_FILE_PAGES);
		INC_CACHE_INFO(add_total);
	}
	spin_unlock_irq(&swapper_space.tree_lock);

	if (unlikely(error)) {
		/*
		 * Only the context that has set the SWAP_HAS_CACHE flag
		 * calls add_to_swap_cache(), so add_to_swap_cache() never
		 * returns -EEXIST.
		 */
		VM_BUG_ON(error == -EEXIST);
		set_page_private(page, 0UL);
		ClearPageSwapCache(page);
		page_cache_release(page);
	}

	return error;
}
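
/*
 * Preload the radix tree while we may still sleep, so that the insertion
 * in __add_to_swap_cache() does not have to allocate nodes under the
 * tree_lock.
 */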
int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
	int error;

	error = radix_tree_preload(gfp_mask);
	if (!error) {
		error = __add_to_swap_cache(page, entry);
		radix_tree_preload_end();
	}
	return error;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(!PageSwapCache(page));
	VM_BUG_ON(PageWriteback(page));

	radix_tree_delete(&swapper_space.page_tree, page_private(page));
	set_page_private(page, 0);
	ClearPageSwapCache(page);
	total_swapcache_pages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
	INC_CACHE_INFO(del_total);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache. Caller needs to hold the page lock.
 */
int add_to_swap(struct page *page)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(!PageUptodate(page));

	entry = get_swap_page();
	if (!entry.val)
		return 0;

	if (unlikely(PageTransHuge(page)))
		if (unlikely(split_huge_page(page))) {
			swapcache_free(entry, NULL);
			return 0;
		}

	/*
	 * Radix-tree node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache and mark it dirty
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);

	if (!err) {	/* Success */
		SetPageDirty(page);
		return 1;
	} else {	/* -ENOMEM radix-tree allocation failure */
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		swapcache_free(entry, NULL);
		return 0;
	}
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list;
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;

	entry.val = page_private(page);

	spin_lock_irq(&swapper_space.tree_lock);
	__delete_from_swap_cache(page);
	spin_unlock_irq(&swapper_space.tree_lock);

	swapcache_free(entry, page);
	page_cache_release(page);
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's OK to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 *				- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
		try_to_free_swap(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	page_cache_release(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them. They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;
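
	/*
	 * Drain the per-CPU LRU-add pagevecs first: pages still sitting
	 * there hold an extra reference that would keep release_pages()
	 * below from actually freeing them.
	 */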
	lru_add_drain();
	while (nr) {
		int todo = min(nr, PAGEVEC_SIZE);
		int i;

		for (i = 0; i < todo; i++)
			free_swap_cache(pagep[i]);
		release_pages(pagep, todo, 0);
		pagep += todo;
		nr -= todo;
	}
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry)
{
	struct page *page;

	page = find_get_page(&swapper_space, entry.val);

	if (page)
		INC_CACHE_INFO(find_success);

	INC_CACHE_INFO(find_total);
	return page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *found_page, *new_page = NULL;
	int err;
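
	/*
	 * Loop until we either find the page in the swap cache, add a
	 * freshly allocated page to it, or hit a hard failure; a racing
	 * task caching the same entry (-EEXIST from swapcache_prepare())
	 * simply makes us go around again.
	 */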
	do {
		/*
		 * First check the swap cache. Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(&swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) {	/* seems racy */
			radix_tree_preload_end();
			continue;
		}
		if (err) {		/* swp entry is obsolete? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__set_page_locked(new_page);
		SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_anon(new_page);
			swap_readpage(new_page);
			return new_page;
		}
		radix_tree_preload_end();
		ClearPageSwapBacked(new_page);
		__clear_page_locked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		swapcache_free(entry, NULL);
	} while (err != -ENOMEM);

	if (new_page)
		page_cache_release(new_page);
	return found_page;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time. We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	int nr_pages;
	struct page *page;
	unsigned long offset;
	unsigned long end_offset;

	/*
	 * Get starting offset for readaround, and number of pages to read.
	 * Adjust starting address by readbehind (for NUMA interleave case)?
	 * No, it's very unlikely that swap layout would follow vma layout,
	 * more likely that neighbouring swap pages came from the same node:
	 * so use the same "addr" to choose the same node for each swap read.
	 */
	nr_pages = valid_swaphandles(entry, &offset);
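	/*
	 * Read the whole aligned cluster asynchronously. If one of the
	 * reads cannot be started (allocation failure or the entry is no
	 * longer in use), just stop the readahead early.
	 */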
	for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
		/* Ok, do the async read-ahead now */
		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
						gfp_mask, vma, addr);
		if (!page)
			break;
		page_cache_release(page);
	}
	lru_add_drain();	/* Push any new pages onto the LRU now */
	return read_swap_cache_async(entry, gfp_mask, vma, addr);
}