[mirror_ubuntu-bionic-kernel.git] / mm / compaction.c

/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>

/*
 * compact_control is used to track pages being migrated and the free pages
 * they are being migrated to during memory compaction. The free_pfn starts
 * at the end of a zone and migrate_pfn begins at the start. Movable pages
 * are moved to the end of a zone during a compaction run and the run
 * completes when free_pfn <= migrate_pfn
 */
struct compact_control {
	struct list_head freepages;	/* List of free pages to migrate to */
	struct list_head migratepages;	/* List of pages being migrated */
	unsigned long nr_freepages;	/* Number of isolated free pages */
	unsigned long nr_migratepages;	/* Number of pages to migrate */
	unsigned long free_pfn;		/* isolate_freepages search base */
	unsigned long migrate_pfn;	/* isolate_migratepages search base */

	/* Account for isolated anon and file pages */
	unsigned long nr_anon;
	unsigned long nr_file;

	/*
	 * Order a direct compactor needs. Signed on purpose: compact_node()
	 * stores -1 here and compact_finished() compares against -1 to mean
	 * "no particular order, keep compacting the whole zone". The value is
	 * also handed to helpers that take a plain int order.
	 */
	int order;
	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
	struct zone *zone;		/* Zone being compacted */
};

static unsigned long release_freepages(struct list_head *freelist)
{
	struct page *page, *next;
	unsigned long count = 0;

	list_for_each_entry_safe(page, next, freelist, lru) {
		list_del(&page->lru);
		__free_page(page);
		count++;
	}

	return count;
}

/*
 * Isolate free pages onto a private freelist. Must hold zone->lock.
 *
 * Scans from @blockpfn up to the end of that pageblock-sized window (or the
 * end of @zone, whichever comes first). Every buddy page found is split into
 * order-0 pages which are added to @freelist. Returns the number of order-0
 * pages isolated.
 */
static unsigned long isolate_freepages_block(struct zone *zone,
				unsigned long blockpfn,
				struct list_head *freelist)
{
	unsigned long zone_end_pfn, end_pfn;
	int nr_scanned = 0, total_isolated = 0;
	struct page *cursor;

	/* The scan stops at the block end or the zone end, whichever is lower */
	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);

	/* Find the first usable PFN in the block to initialise page cursor */
	while (blockpfn < end_pfn && !pfn_valid_within(blockpfn))
		blockpfn++;
	cursor = pfn_to_page(blockpfn);

	/* Isolate free pages. This assumes the block is valid */
	while (blockpfn < end_pfn) {
		int isolated = 0;
		int i;

		if (pfn_valid_within(blockpfn)) {
			nr_scanned++;

			if (PageBuddy(cursor)) {
				/* Found a free page, break it into order-0 pages */
				isolated = split_free_page(cursor);
				total_isolated += isolated;
				for (i = 0; i < isolated; i++)
					list_add(&cursor[i].lru, freelist);
			}
		}

		/*
		 * Step over everything that was just split off; otherwise
		 * move on by a single page.
		 */
		if (isolated) {
			blockpfn += isolated;
			cursor += isolated;
		} else {
			blockpfn++;
			cursor++;
		}
	}

	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
	return total_isolated;
}

/*
 * Decide whether the pageblock containing @page may be used as a
 * destination for migrated pages. Returns true if it may.
 */
static bool suitable_migration_target(struct page *page)
{
	const int block_type = get_pageblock_migratetype(page);

	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
	if (block_type == MIGRATE_ISOLATE || block_type == MIGRATE_RESERVE)
		return false;

	/* MIGRATE_MOVABLE blocks are always acceptable */
	if (block_type == MIGRATE_MOVABLE)
		return true;

	/*
	 * Any other block type qualifies only when @page is a free page
	 * of at least pageblock order.
	 */
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/*
 * Walk pageblocks downwards from cc->free_pfn, isolating free pages from
 * every suitable block onto cc->freepages. On return cc->free_pfn and
 * cc->nr_freepages have been updated to reflect the scan.
 */
static void isolate_freepages(struct zone *zone,
				struct compact_control *cc)
{
	struct list_head *freelist = &cc->freepages;
	unsigned long scan_floor = cc->migrate_pfn + pageblock_nr_pages;
	unsigned long restart_pfn = scan_floor;
	unsigned long block_pfn;
	unsigned long flags;
	int nr_free = cc->nr_freepages;
	struct page *page;

	/*
	 * Keep isolating until there are enough free pages for everything
	 * on cc->migratepages, or until the scan would run into the block
	 * the migrate scanner is working on.
	 */
	spin_lock_irqsave(&zone->lock, flags);
	for (block_pfn = cc->free_pfn;
	     block_pfn > scan_floor && cc->nr_migratepages > nr_free;
	     block_pfn -= pageblock_nr_pages) {
		unsigned long got;

		if (!pfn_valid(block_pfn))
			continue;

		/*
		 * Zones/nodes can overlap (e.g. node0 node1 node0), so a pfn
		 * inside the zone's span does not necessarily belong to it.
		 * Also skip blocks that are not a suitable migration target.
		 */
		page = pfn_to_page(block_pfn);
		if (page_zone(page) != zone || !suitable_migration_target(page))
			continue;

		/* Found a block suitable for isolating free pages from */
		got = isolate_freepages_block(zone, block_pfn, freelist);
		nr_free += got;

		/*
		 * Remember the highest block that yielded pages; the next
		 * search restarts there because migration may have handed
		 * pages back to the allocator.
		 */
		if (got && block_pfn > restart_pfn)
			restart_pfn = block_pfn;
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	/* split_free_page does not map the pages */
	list_for_each_entry(page, freelist, lru) {
		arch_alloc_page(page, 0);
		kernel_map_pages(page, 1, 1);
	}

	cc->nr_freepages = nr_free;
	cc->free_pfn = restart_pfn;
}

/* Update the number of anon and file isolated pages in the zone */
static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
	struct page *page;
	unsigned int count[NR_LRU_LISTS] = { 0, };

	list_for_each_entry(page, &cc->migratepages, lru) {
		int lru = page_lru_base_type(page);
		count[lru]++;
	}

	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
}

/*
 * Similar to reclaim, but different enough that they don't share logic.
 *
 * Returns true when the zone's isolated (anon + file) page count exceeds
 * half of its active + inactive LRU page count.
 */
static bool too_many_isolated(struct zone *zone)
{
	unsigned long nr_on_lru, nr_isolated;

	nr_on_lru = zone_page_state(zone, NR_INACTIVE_FILE);
	nr_on_lru += zone_page_state(zone, NR_INACTIVE_ANON);
	nr_on_lru += zone_page_state(zone, NR_ACTIVE_FILE);
	nr_on_lru += zone_page_state(zone, NR_ACTIVE_ANON);

	nr_isolated = zone_page_state(zone, NR_ISOLATED_FILE);
	nr_isolated += zone_page_state(zone, NR_ISOLATED_ANON);

	return nr_isolated > nr_on_lru / 2;
}

/*
 * Isolate pages that can be migrated, starting at the migrate scanner
 * position held in @cc, and put them on cc->migratepages.
 *
 * Returns cc->nr_migratepages after the scan, or 0 when nothing was scanned
 * (the window would cross the free scanner, starts in a memory hole, or a
 * fatal signal arrived while waiting for isolated pages to drain).
 */
static unsigned long isolate_migratepages(struct zone *zone,
					struct compact_control *cc)
{
	struct list_head *migratelist = &cc->migratepages;
	unsigned long nr_scanned = 0, nr_isolated = 0;
	unsigned long scan_pfn, window_end;

	/* Do not scan outside zone boundaries */
	scan_pfn = cc->migrate_pfn;
	if (scan_pfn < zone->zone_start_pfn)
		scan_pfn = zone->zone_start_pfn;

	/* The scan window ends on a pageblock-aligned pfn */
	window_end = ALIGN(scan_pfn + pageblock_nr_pages, pageblock_nr_pages);

	/* Do not cross the free scanner or scan within a memory hole */
	if (window_end > cc->free_pfn || !pfn_valid(scan_pfn)) {
		cc->migrate_pfn = window_end;
		return 0;
	}

	/*
	 * Parallel reclaimers or compactors may already have pulled a lot of
	 * pages off the LRU. Back off until fewer pages are isolated, giving
	 * up if the task has been fatally signalled.
	 */
	while (unlikely(too_many_isolated(zone))) {
		congestion_wait(BLK_RW_ASYNC, HZ/10);

		if (fatal_signal_pending(current))
			return 0;
	}

	/* Time to isolate some pages for migration */
	spin_lock_irq(&zone->lru_lock);
	for (; scan_pfn < window_end; scan_pfn++) {
		struct page *page;

		if (!pfn_valid_within(scan_pfn))
			continue;
		nr_scanned++;

		/* Skip free pages and anything that cannot be isolated */
		page = pfn_to_page(scan_pfn);
		if (PageBuddy(page) ||
		    __isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
			continue;

		/* Successfully isolated: move it from the LRU to our list */
		del_page_from_lru_list(zone, page, page_lru(page));
		list_add(&page->lru, migratelist);
		nr_isolated++;
		cc->nr_migratepages++;

		/* Avoid isolating too much */
		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
			break;
	}

	acct_isolated(zone, cc);

	spin_unlock_irq(&zone->lru_lock);
	cc->migrate_pfn = scan_pfn;

	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);

	return cc->nr_migratepages;
}

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct page *compaction_alloc(struct page *migratepage,
					unsigned long data,
					int **result)
{
	struct compact_control *cc = (struct compact_control *)data;
	struct page *freepage;

	/* Isolate free pages if necessary */
	if (list_empty(&cc->freepages)) {
		isolate_freepages(cc->zone, cc);

		if (list_empty(&cc->freepages))
			return NULL;
	}

	freepage = list_entry(cc->freepages.next, struct page, lru);
	list_del(&freepage->lru);
	cc->nr_freepages--;

	return freepage;
}

/*
 * We cannot control nr_migratepages and nr_freepages fully when migration is
 * running as migrate_pages() has no knowledge of compact_control. When
 * migration is complete, we count the number of pages on the lists by hand.
 */
static void update_nr_listpages(struct compact_control *cc)
{
	int nr_migratepages = 0;
	int nr_freepages = 0;
	struct page *page;

	list_for_each_entry(page, &cc->migratepages, lru)
		nr_migratepages++;
	list_for_each_entry(page, &cc->freepages, lru)
		nr_freepages++;

	cc->nr_migratepages = nr_migratepages;
	cc->nr_freepages = nr_freepages;
}

/*
 * Decide whether the current compaction run on @zone should stop.
 *
 * Returns
 *   COMPACT_PARTIAL  - a fatal signal is pending, or a page that satisfies
 *                      the requested order is now free
 *   COMPACT_COMPLETE - the migrate and free scanners have met
 *   COMPACT_CONTINUE - keep compacting
 */
static int compact_finished(struct zone *zone,
						struct compact_control *cc)
{
	unsigned int order;
	unsigned long watermark;

	if (fatal_signal_pending(current))
		return COMPACT_PARTIAL;

	/* Compaction run completes if the migrate and free scanner meet */
	if (cc->free_pfn <= cc->migrate_pfn)
		return COMPACT_COMPLETE;

	/*
	 * order == -1 means "compact the whole zone" (see compact_node()).
	 * There is no target order, so only the scanners meeting ends the
	 * run. This must be tested before cc->order is used as a shift
	 * count below, otherwise the shift is undefined.
	 */
	if (cc->order == -1)
		return COMPACT_CONTINUE;

	/* Compaction run is not finished if the watermark is not met */
	watermark = low_wmark_pages(zone) + (1UL << cc->order);
	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
		return COMPACT_CONTINUE;

	/* Direct compactor: Is a suitable page free? */
	for (order = cc->order; order < MAX_ORDER; order++) {
		/* Job done if page is free of the right migratetype */
		if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
			return COMPACT_PARTIAL;

		/* Job done if allocation would set block type */
		if (order >= pageblock_order && zone->free_area[order].nr_free)
			return COMPACT_PARTIAL;
	}

	return COMPACT_CONTINUE;
}

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 */
unsigned long compaction_suitable(struct zone *zone, int order)
{
	int fragindex;
	unsigned long watermark;

	/*
	 * order == -1 is the "compact the whole zone" request issued by
	 * compact_node(). There is no allocation order to evaluate, and
	 * using -1 as a shift count below would be undefined, so always
	 * let compaction run.
	 */
	if (order == -1)
		return COMPACT_CONTINUE;

	/*
	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
	 * This is because during migration, copies of pages need to be
	 * allocated and for a short time, the footprint is higher
	 */
	watermark = low_wmark_pages(zone) + (2UL << order);
	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
		return COMPACT_SKIPPED;

	/*
	 * fragmentation index determines if allocation failures are due to
	 * low memory or external fragmentation
	 *
	 * index of -1 implies allocations might succeed depending on watermarks
	 * index towards 0 implies failure is due to lack of memory
	 * index towards 1000 implies failure is due to fragmentation
	 *
	 * Only compact if a failure would be due to fragmentation.
	 */
	fragindex = fragmentation_index(zone, order);
	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
		return COMPACT_SKIPPED;

	if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
		return COMPACT_PARTIAL;

	return COMPACT_CONTINUE;
}

/*
 * Run compaction on @zone as described by @cc.
 *
 * Returns the compaction_suitable() verdict straight away when that is
 * COMPACT_PARTIAL or COMPACT_SKIPPED; otherwise returns the status that
 * made compact_finished() end the run.
 */
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
	int ret;

	ret = compaction_suitable(zone, cc->order);
	if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED) {
		/* Compaction is likely to fail */
		return ret;
	}

	/*
	 * Setup to move all movable pages to the end of the zone: the
	 * migrate scanner starts at the bottom, the free scanner at the
	 * zone end rounded down to a pageblock boundary.
	 */
	cc->migrate_pfn = zone->zone_start_pfn;
	cc->free_pfn = (cc->migrate_pfn + zone->spanned_pages) &
					~(pageblock_nr_pages-1);

	migrate_prep_local();

	for (;;) {
		unsigned long nr_migrate, nr_remaining;

		ret = compact_finished(zone, cc);
		if (ret != COMPACT_CONTINUE)
			break;

		/* Nothing isolated this round: re-evaluate and try again */
		if (!isolate_migratepages(zone, cc))
			continue;

		nr_migrate = cc->nr_migratepages;
		migrate_pages(&cc->migratepages, compaction_alloc,
						(unsigned long)cc, 0);

		/* Whatever is still on the list failed to migrate */
		update_nr_listpages(cc);
		nr_remaining = cc->nr_migratepages;

		count_vm_event(COMPACTBLOCKS);
		count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
		if (nr_remaining)
			count_vm_events(COMPACTPAGEFAILED, nr_remaining);
		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
						nr_remaining);

		/* Release LRU pages not migrated */
		if (!list_empty(&cc->migratepages)) {
			putback_lru_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
		}
	}

	/* Release free pages and check accounting */
	cc->nr_freepages -= release_freepages(&cc->freepages);
	VM_BUG_ON(cc->nr_freepages != 0);

	return ret;
}

/*
 * Compact @zone on behalf of an allocation of the given @order. The
 * migratetype to look for is derived from @gfp_mask. Returns the
 * compact_zone() status.
 */
unsigned long compact_zone_order(struct zone *zone,
						int order, gfp_t gfp_mask)
{
	/* Fields not named here (counters, scanner positions) start at zero */
	struct compact_control cc = {
		.zone = zone,
		.order = order,
		.migratetype = allocflags_to_migratetype(gfp_mask),
	};

	INIT_LIST_HEAD(&cc.migratepages);
	INIT_LIST_HEAD(&cc.freepages);

	return compact_zone(zone, &cc);
}

/*
 * Fragmentation-index threshold used by compaction_suitable(): a zone whose
 * index lies between 0 and this value (inclusive) is skipped.
 */
int sysctl_extfrag_threshold = 500;

/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @zonelist: The zonelist used for the current allocation
 * @order: The order of the current allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @nodemask: The allowed nodes to allocate from
 *
 * This is the main entry point for direct page compaction.
 *
 * Returns COMPACT_SKIPPED when compaction is not attempted, otherwise the
 * highest status reported by any zone that was compacted.
 */
unsigned long try_to_compact_pages(struct zonelist *zonelist,
			int order, gfp_t gfp_mask, nodemask_t *nodemask)
{
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	int may_enter_fs = gfp_mask & __GFP_FS;
	int may_perform_io = gfp_mask & __GFP_IO;
	int rc = COMPACT_SKIPPED;
	struct zoneref *z;
	struct zone *zone;

	/*
	 * Not worth starting for "cheap" orders: the page allocator is
	 * assumed to satisfy those without taking special steps.
	 */
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
		return rc;

	/* Compaction is only attempted when both FS and IO are permitted */
	if (!may_enter_fs || !may_perform_io)
		return rc;

	count_vm_event(COMPACTSTALL);

	/* Compact each zone in the list */
	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
								nodemask) {
		int status = compact_zone_order(zone, order, gfp_mask);

		/* Keep the highest status seen so far */
		if (status > rc)
			rc = status;

		/* If a normal allocation would succeed, stop compacting */
		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
			break;
	}

	return rc;
}


/* Compact all zones within a node */
static int compact_node(int nid)
{
	int zoneid;
	pg_data_t *pgdat;
	struct zone *zone;

	if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
		return -EINVAL;
	pgdat = NODE_DATA(nid);

	/* Flush pending updates to the LRU lists */
	lru_add_drain_all();

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		struct compact_control cc = {
			.nr_freepages = 0,
			.nr_migratepages = 0,
			.order = -1,
		};

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		cc.zone = zone;
		INIT_LIST_HEAD(&cc.freepages);
		INIT_LIST_HEAD(&cc.migratepages);

		compact_zone(zone, &cc);

		VM_BUG_ON(!list_empty(&cc.freepages));
		VM_BUG_ON(!list_empty(&cc.migratepages));
	}

	return 0;
}

/* Compact all nodes in the system */
static int compact_nodes(void)
{
	int nid;

	for_each_online_node(nid)
		compact_node(nid);

	return COMPACT_COMPLETE;
}

/*
 * The written value is actually unused, all memory is compacted.
 * Nothing in this file reads the variable; it is presumably the backing
 * store for the sysctl table entry -- TODO confirm against the table.
 */
int sysctl_compact_memory;

/*
 * This is the entry point for compacting all nodes via /proc/sys/vm.
 *
 * A write compacts every online node; a read does nothing. Always returns 0.
 * compact_nodes() reports COMPACT_COMPLETE, which is an internal compaction
 * status and not an error code, so it must not be passed back to the
 * caller as the handler's result.
 */
int sysctl_compaction_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	if (write)
		compact_nodes();

	return 0;
}

/*
 * Handler that forwards the request to proc_dointvec_minmax().
 *
 * Returns whatever proc_dointvec_minmax() returns, so that a rejected
 * write is reported to the caller instead of being silently treated as
 * success.
 */
int sysctl_extfrag_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	return proc_dointvec_minmax(table, write, buffer, length, ppos);
}

#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
/*
 * Store callback for the per-node "compact" attribute: compacts the node
 * identified by @dev->id. The written data is ignored and @count is always
 * returned, so the whole write is reported as consumed.
 */
ssize_t sysfs_compact_node(struct sys_device *dev,
			struct sysdev_attribute *attr,
			const char *buf, size_t count)
{
	int nid = dev->id;

	compact_node(nid);

	return count;
}
/* Owner-writable attribute with no show method */
static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);

/*
 * Create the "compact" attribute file for @node.
 * Returns the result of sysdev_create_file().
 */
int compaction_register_node(struct node *node)
{
	int err;

	err = sysdev_create_file(&node->sysdev, &attr_compact);

	return err;
}

/* Remove the "compact" attribute file from @node. */
void compaction_unregister_node(struct node *node)
{
	/* A void function must not return an expression */
	sysdev_remove_file(&node->sysdev, &attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
Commit	Line	Data
748446bb MG	1	/*
	2	* linux/mm/compaction.c
	3	*
	4	* Memory compaction for the reduction of external fragmentation. Note that
	5	* this heavily depends upon page migration to do all the real heavy
	6	* lifting
	7	*
	8	* Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
	9	*/
	10	#include <linux/swap.h>
	11	#include <linux/migrate.h>
	12	#include <linux/compaction.h>
	13	#include <linux/mm_inline.h>
	14	#include <linux/backing-dev.h>
76ab0f53	15	#include <linux/sysctl.h>
ed4a6d7f	16	#include <linux/sysfs.h>
748446bb MG	17	#include "internal.h"
748446bb MG	18
b7aba698 MG	19	#define CREATE_TRACE_POINTS
	20	#include <trace/events/compaction.h>
	21
748446bb MG	22	/*
	23	* compact_control is used to track pages being migrated and the free pages
	24	* they are being migrated to during memory compaction. The free_pfn starts
	25	* at the end of a zone and migrate_pfn begins at the start. Movable pages
	26	* are moved to the end of a zone during a compaction run and the run
	27	* completes when free_pfn <= migrate_pfn
	28	*/
	29	struct compact_control {
	30	struct list_head freepages; /* List of free pages to migrate to */
	31	struct list_head migratepages; /* List of pages being migrated */
	32	unsigned long nr_freepages; /* Number of isolated free pages */
	33	unsigned long nr_migratepages; /* Number of pages to migrate */
	34	unsigned long free_pfn; /* isolate_freepages search base */
	35	unsigned long migrate_pfn; /* isolate_migratepages search base */
	36
	37	/* Account for isolated anon and file pages */
	38	unsigned long nr_anon;
	39	unsigned long nr_file;
	40
56de7263 MG	41	unsigned int order; /* order a direct compactor needs */
56de7263 MG	42	int migratetype; /* MOVABLE, RECLAIMABLE etc */
748446bb MG	43	struct zone *zone;
	44	};
	45
	46	static unsigned long release_freepages(struct list_head *freelist)
	47	{
	48	struct page *page, *next;
	49	unsigned long count = 0;
	50
	51	list_for_each_entry_safe(page, next, freelist, lru) {
	52	list_del(&page->lru);
	53	__free_page(page);
	54	count++;
	55	}
	56
	57	return count;
	58	}
	59
	60	/* Isolate free pages onto a private freelist. Must hold zone->lock */
	61	static unsigned long isolate_freepages_block(struct zone *zone,
	62	unsigned long blockpfn,
	63	struct list_head *freelist)
	64	{
	65	unsigned long zone_end_pfn, end_pfn;
b7aba698	66	int nr_scanned = 0, total_isolated = 0;
748446bb MG	67	struct page *cursor;
	68
	69	/* Get the last PFN we should scan for free pages at */
	70	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	71	end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
	72
	73	/* Find the first usable PFN in the block to initialse page cursor */
	74	for (; blockpfn < end_pfn; blockpfn++) {
	75	if (pfn_valid_within(blockpfn))
	76	break;
	77	}
	78	cursor = pfn_to_page(blockpfn);
	79
	80	/* Isolate free pages. This assumes the block is valid */
	81	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
	82	int isolated, i;
	83	struct page *page = cursor;
	84
	85	if (!pfn_valid_within(blockpfn))
	86	continue;
b7aba698	87	nr_scanned++;
748446bb MG	88
	89	if (!PageBuddy(page))
	90	continue;
	91
	92	/* Found a free page, break it into order-0 pages */
	93	isolated = split_free_page(page);
	94	total_isolated += isolated;
	95	for (i = 0; i < isolated; i++) {
	96	list_add(&page->lru, freelist);
	97	page++;
	98	}
	99
	100	/* If a page was split, advance to the end of it */
	101	if (isolated) {
	102	blockpfn += isolated - 1;
	103	cursor += isolated - 1;
	104	}
	105	}
	106
b7aba698	107	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
748446bb MG	108	return total_isolated;
	109	}
	110
	111	/* Returns true if the page is within a block suitable for migration to */
	112	static bool suitable_migration_target(struct page *page)
	113	{
	114
	115	int migratetype = get_pageblock_migratetype(page);
	116
	117	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
	118	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
	119	return false;
	120
	121	/* If the page is a large free page, then allow migration */
	122	if (PageBuddy(page) && page_order(page) >= pageblock_order)
	123	return true;
	124
	125	/* If the block is MIGRATE_MOVABLE, allow migration */
	126	if (migratetype == MIGRATE_MOVABLE)
	127	return true;
	128
	129	/* Otherwise skip the block */
	130	return false;
	131	}
	132
	133	/*
	134	* Based on information in the current compact_control, find blocks
	135	* suitable for isolating free pages from and then isolate them.
	136	*/
	137	static void isolate_freepages(struct zone *zone,
	138	struct compact_control *cc)
	139	{
	140	struct page *page;
	141	unsigned long high_pfn, low_pfn, pfn;
	142	unsigned long flags;
	143	int nr_freepages = cc->nr_freepages;
	144	struct list_head *freelist = &cc->freepages;
	145
	146	pfn = cc->free_pfn;
	147	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
	148	high_pfn = low_pfn;
	149
	150	/*
	151	* Isolate free pages until enough are available to migrate the
	152	* pages on cc->migratepages. We stop searching if the migrate
	153	* and free page scanners meet or enough free pages are isolated.
	154	*/
	155	spin_lock_irqsave(&zone->lock, flags);
	156	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
	157	pfn -= pageblock_nr_pages) {
	158	unsigned long isolated;
	159
	160	if (!pfn_valid(pfn))
	161	continue;
	162
	163	/*
	164	* Check for overlapping nodes/zones. It's possible on some
	165	* configurations to have a setup like
	166	* node0 node1 node0
	167	* i.e. it's possible that all pages within a zones range of
	168	* pages do not belong to a single zone.
	169	*/
	170	page = pfn_to_page(pfn);
	171	if (page_zone(page) != zone)
172	continue;
173
174	/* Check the block is suitable for migration */
175	if (!suitable_migration_target(page))
176	continue;
177
178	/* Found a block suitable for isolating free pages from */
179	isolated = isolate_freepages_block(zone, pfn, freelist);
180	nr_freepages += isolated;
181
182	/*
183	* Record the highest PFN we isolated pages from. When next
184	* looking for free pages, the search will restart here as
185	* page migration may have returned some pages to the allocator
186	*/
187	if (isolated)
188	high_pfn = max(high_pfn, pfn);
189	}
190	spin_unlock_irqrestore(&zone->lock, flags);
191
192	/* split_free_page does not map the pages */
193	list_for_each_entry(page, freelist, lru) {
194	arch_alloc_page(page, 0);
195	kernel_map_pages(page, 1, 1);
196	}
197
198	cc->free_pfn = high_pfn;
199	cc->nr_freepages = nr_freepages;
200	}
201
202	/* Update the number of anon and file isolated pages in the zone */
	203	static void acct_isolated(struct zone *zone, struct compact_control *cc)
204	{
205	struct page *page;
206	unsigned int count[NR_LRU_LISTS] = { 0, };
207
208	list_for_each_entry(page, &cc->migratepages, lru) {
209	int lru = page_lru_base_type(page);
210	count[lru]++;
211	}
212
213	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
214	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
215	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
216	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
217	}
218
219	/* Similar to reclaim, but different enough that they don't share logic */
220	static bool too_many_isolated(struct zone *zone)
221	{
bc693045	222	unsigned long active, inactive, isolated;
748446bb MG	223
	224	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
	225	zone_page_state(zone, NR_INACTIVE_ANON);
bc693045 MK	226	active = zone_page_state(zone, NR_ACTIVE_FILE) +
bc693045 MK	227	zone_page_state(zone, NR_ACTIVE_ANON);
748446bb MG	228	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
	229	zone_page_state(zone, NR_ISOLATED_ANON);
	230
bc693045	231	return isolated > (inactive + active) / 2;
748446bb MG	232	}
	233
	234	/*
	235	* Isolate all pages that can be migrated from the block pointed to by
	236	* the migrate scanner within compact_control.
	237	*/
	238	static unsigned long isolate_migratepages(struct zone *zone,
	239	struct compact_control *cc)
	240	{
	241	unsigned long low_pfn, end_pfn;
b7aba698	242	unsigned long nr_scanned = 0, nr_isolated = 0;
748446bb MG	243	struct list_head *migratelist = &cc->migratepages;
	244
	245	/* Do not scan outside zone boundaries */
	246	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
	247
	248	/* Only scan within a pageblock boundary */
	249	end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
	250
	251	/* Do not cross the free scanner or scan within a memory hole */
	252	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
	253	cc->migrate_pfn = end_pfn;
	254	return 0;
	255	}
	256
	257	/*
	258	* Ensure that there are not too many pages isolated from the LRU
	259	* list by either parallel reclaimers or compaction. If there are,
	260	* delay for some time until fewer pages are isolated
	261	*/
	262	while (unlikely(too_many_isolated(zone))) {
	263	congestion_wait(BLK_RW_ASYNC, HZ/10);
	264
	265	if (fatal_signal_pending(current))
	266	return 0;
	267	}
	268
	269	/* Time to isolate some pages for migration */
	270	spin_lock_irq(&zone->lru_lock);
	271	for (; low_pfn < end_pfn; low_pfn++) {
	272	struct page *page;
	273	if (!pfn_valid_within(low_pfn))
	274	continue;
b7aba698	275	nr_scanned++;
748446bb MG	276
	277	/* Get the page and skip if free */
	278	page = pfn_to_page(low_pfn);
	279	if (PageBuddy(page))
	280	continue;
	281
	282	/* Try isolate the page */
	283	if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
	284	continue;
	285
	286	/* Successfully isolated */
	287	del_page_from_lru_list(zone, page, page_lru(page));
	288	list_add(&page->lru, migratelist);
748446bb	289	cc->nr_migratepages++;
b7aba698	290	nr_isolated++;
748446bb MG	291
	292	/* Avoid isolating too much */
	293	if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
	294	break;
	295	}
	296
	297	acct_isolated(zone, cc);
	298
	299	spin_unlock_irq(&zone->lru_lock);
	300	cc->migrate_pfn = low_pfn;
	301
b7aba698 MG	302	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
b7aba698 MG	303
748446bb MG	304	return cc->nr_migratepages;
	305	}
	306
	307	/*
	308	* This is a migrate-callback that "allocates" freepages by taking pages
	309	* from the isolated freelists in the block we are migrating to.
	310	*/
	311	static struct page *compaction_alloc(struct page *migratepage,
	312	unsigned long data,
	313	int **result)
	314	{
	315	struct compact_control *cc = (struct compact_control *)data;
	316	struct page *freepage;
	317
	318	/* Isolate free pages if necessary */
	319	if (list_empty(&cc->freepages)) {
	320	isolate_freepages(cc->zone, cc);
	321
	322	if (list_empty(&cc->freepages))
	323	return NULL;
	324	}
	325
	326	freepage = list_entry(cc->freepages.next, struct page, lru);
	327	list_del(&freepage->lru);
	328	cc->nr_freepages--;
	329
	330	return freepage;
	331	}
	332
	333	/*
	334	* We cannot control nr_migratepages and nr_freepages fully when migration is
	335	* running as migrate_pages() has no knowledge of compact_control. When
	336	* migration is complete, we count the number of pages on the lists by hand.
	337	*/
	338	static void update_nr_listpages(struct compact_control *cc)
	339	{
	340	int nr_migratepages = 0;
	341	int nr_freepages = 0;
	342	struct page *page;
	343
	344	list_for_each_entry(page, &cc->migratepages, lru)
	345	nr_migratepages++;
	346	list_for_each_entry(page, &cc->freepages, lru)
	347	nr_freepages++;
	348
	349	cc->nr_migratepages = nr_migratepages;
	350	cc->nr_freepages = nr_freepages;
	351	}
	352
	353	static int compact_finished(struct zone *zone,
	354	struct compact_control *cc)
	355	{
56de7263 MG	356	unsigned int order;
	357	unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
	358
748446bb MG	359	if (fatal_signal_pending(current))
	360	return COMPACT_PARTIAL;
	361
	362	/* Compaction run completes if the migrate and free scanner meet */
	363	if (cc->free_pfn <= cc->migrate_pfn)
	364	return COMPACT_COMPLETE;
	365
56de7263 MG	366	/* Compaction run is not finished if the watermark is not met */
	367	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
	368	return COMPACT_CONTINUE;
	369
	370	if (cc->order == -1)
	371	return COMPACT_CONTINUE;
	372
	373	/* Direct compactor: Is a suitable page free? */
	374	for (order = cc->order; order < MAX_ORDER; order++) {
	375	/* Job done if page is free of the right migratetype */
	376	if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
	377	return COMPACT_PARTIAL;
	378
	379	/* Job done if allocation would set block type */
	380	if (order >= pageblock_order && zone->free_area[order].nr_free)
	381	return COMPACT_PARTIAL;
	382	}
	383
748446bb MG	384	return COMPACT_CONTINUE;
	385	}
	386
3e7d3449 MG	387	/*
	388	* compaction_suitable: Is this suitable to run compaction on this zone now?
	389	* Returns
	390	* COMPACT_SKIPPED - If there are too few free pages for compaction
	391	* COMPACT_PARTIAL - If the allocation would succeed without compaction
	392	* COMPACT_CONTINUE - If compaction should run now
	393	*/
	394	unsigned long compaction_suitable(struct zone *zone, int order)
	395	{
	396	int fragindex;
	397	unsigned long watermark;
	398
	399	/*
	400	* Watermarks for order-0 must be met for compaction. Note the 2UL.
	401	* This is because during migration, copies of pages need to be
	402	* allocated and for a short time, the footprint is higher
	403	*/
	404	watermark = low_wmark_pages(zone) + (2UL << order);
	405	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
	406	return COMPACT_SKIPPED;
	407
	408	/*
	409	* fragmentation index determines if allocation failures are due to
	410	* low memory or external fragmentation
	411	*
	412	* index of -1 implies allocations might succeed dependingon watermarks
	413	* index towards 0 implies failure is due to lack of memory
	414	* index towards 1000 implies failure is due to fragmentation
	415	*
	416	* Only compact if a failure would be due to fragmentation.
	417	*/
	418	fragindex = fragmentation_index(zone, order);
	419	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
	420	return COMPACT_SKIPPED;
	421
	422	if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
	423	return COMPACT_PARTIAL;
	424
	425	return COMPACT_CONTINUE;
	426	}
	427
748446bb MG	428	static int compact_zone(struct zone *zone, struct compact_control *cc)
	429	{
	430	int ret;
	431
3e7d3449 MG	432	ret = compaction_suitable(zone, cc->order);
	433	switch (ret) {
	434	case COMPACT_PARTIAL:
	435	case COMPACT_SKIPPED:
	436	/* Compaction is likely to fail */
	437	return ret;
	438	case COMPACT_CONTINUE:
	439	/* Fall through to compaction */
	440	;
	441	}
	442
748446bb MG	443	/* Setup to move all movable pages to the end of the zone */
	444	cc->migrate_pfn = zone->zone_start_pfn;
	445	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
	446	cc->free_pfn &= ~(pageblock_nr_pages-1);
	447
	448	migrate_prep_local();
	449
	450	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
	451	unsigned long nr_migrate, nr_remaining;
	452
	453	if (!isolate_migratepages(zone, cc))
	454	continue;
	455
	456	nr_migrate = cc->nr_migratepages;
	457	migrate_pages(&cc->migratepages, compaction_alloc,
	458	(unsigned long)cc, 0);
	459	update_nr_listpages(cc);
	460	nr_remaining = cc->nr_migratepages;
	461
	462	count_vm_event(COMPACTBLOCKS);
	463	count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
	464	if (nr_remaining)
	465	count_vm_events(COMPACTPAGEFAILED, nr_remaining);
b7aba698 MG	466	trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
b7aba698 MG	467	nr_remaining);
748446bb MG	468
	469	/* Release LRU pages not migrated */
	470	if (!list_empty(&cc->migratepages)) {
	471	putback_lru_pages(&cc->migratepages);
	472	cc->nr_migratepages = 0;
	473	}
	474
	475	}
	476
	477	/* Release free pages and check accounting */
	478	cc->nr_freepages -= release_freepages(&cc->freepages);
	479	VM_BUG_ON(cc->nr_freepages != 0);
	480
	481	return ret;
	482	}
76ab0f53	483
3e7d3449	484	unsigned long compact_zone_order(struct zone *zone,
56de7263 MG	485	int order, gfp_t gfp_mask)
	486	{
	487	struct compact_control cc = {
	488	.nr_freepages = 0,
	489	.nr_migratepages = 0,
	490	.order = order,
	491	.migratetype = allocflags_to_migratetype(gfp_mask),
	492	.zone = zone,
	493	};
	494	INIT_LIST_HEAD(&cc.freepages);
	495	INIT_LIST_HEAD(&cc.migratepages);
	496
	497	return compact_zone(zone, &cc);
	498	}
	499
/*
 * Fragmentation index cut-off consulted by compaction_suitable(): a zone
 * whose index lies in [0, sysctl_extfrag_threshold] is not compacted.
 */
int sysctl_extfrag_threshold = 500;
5e771905 MG	501
56de7263 MG	502	/**
	503	* try_to_compact_pages - Direct compact to satisfy a high-order allocation
	504	* @zonelist: The zonelist used for the current allocation
	505	* @order: The order of the current allocation
	506	* @gfp_mask: The GFP mask of the current allocation
	507	* @nodemask: The allowed nodes to allocate from
	508	*
	509	* This is the main entry point for direct page compaction.
	510	*/
	511	unsigned long try_to_compact_pages(struct zonelist *zonelist,
	512	int order, gfp_t gfp_mask, nodemask_t *nodemask)
	513	{
	514	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	515	int may_enter_fs = gfp_mask & __GFP_FS;
	516	int may_perform_io = gfp_mask & __GFP_IO;
56de7263 MG	517	struct zoneref *z;
	518	struct zone *zone;
	519	int rc = COMPACT_SKIPPED;
	520
	521	/*
	522	* Check whether it is worth even starting compaction. The order check is
	523	* made because an assumption is made that the page allocator can satisfy
	524	* the "cheaper" orders without taking special steps
	525	*/
	526	if (order <= PAGE_ALLOC_COSTLY_ORDER \|\| !may_enter_fs \|\| !may_perform_io)
	527	return rc;
	528
	529	count_vm_event(COMPACTSTALL);
	530
	531	/* Compact each zone in the list */
	532	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
	533	nodemask) {
56de7263 MG	534	int status;
56de7263 MG	535
56de7263 MG	536	status = compact_zone_order(zone, order, gfp_mask);
	537	rc = max(status, rc);
	538
3e7d3449 MG	539	/* If a normal allocation would succeed, stop compacting */
3e7d3449 MG	540	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
56de7263 MG	541	break;
	542	}
	543
	544	return rc;
	545	}
	546
	547
76ab0f53 MG	548	/* Compact all zones within a node */
	549	static int compact_node(int nid)
	550	{
	551	int zoneid;
	552	pg_data_t *pgdat;
	553	struct zone *zone;
	554
	555	if (nid < 0 \|\| nid >= nr_node_ids \|\| !node_online(nid))
	556	return -EINVAL;
	557	pgdat = NODE_DATA(nid);
	558
	559	/* Flush pending updates to the LRU lists */
	560	lru_add_drain_all();
	561
	562	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
	563	struct compact_control cc = {
	564	.nr_freepages = 0,
	565	.nr_migratepages = 0,
56de7263	566	.order = -1,
76ab0f53 MG	567	};
	568
	569	zone = &pgdat->node_zones[zoneid];
	570	if (!populated_zone(zone))
	571	continue;
	572
	573	cc.zone = zone;
	574	INIT_LIST_HEAD(&cc.freepages);
	575	INIT_LIST_HEAD(&cc.migratepages);
	576
	577	compact_zone(zone, &cc);
	578
	579	VM_BUG_ON(!list_empty(&cc.freepages));
	580	VM_BUG_ON(!list_empty(&cc.migratepages));
	581	}
	582
	583	return 0;
	584	}
	585
	586	/* Compact all nodes in the system */
	587	static int compact_nodes(void)
	588	{
	589	int nid;
	590
	591	for_each_online_node(nid)
	592	compact_node(nid);
	593
	594	return COMPACT_COMPLETE;
	595	}
	596
	597	/* The written value is actually unused, all memory is compacted */
	598	int sysctl_compact_memory;
	599
	600	/* This is the entry point for compacting all nodes via /proc/sys/vm */
	601	int sysctl_compaction_handler(struct ctl_table *table, int write,
	602	void __user buffer, size_t length, loff_t *ppos)
	603	{
	604	if (write)
	605	return compact_nodes();
	606
	607	return 0;
	608	}
ed4a6d7f	609
5e771905 MG	610	int sysctl_extfrag_handler(struct ctl_table *table, int write,
	611	void __user buffer, size_t length, loff_t *ppos)
	612	{
	613	proc_dointvec_minmax(table, write, buffer, length, ppos);
	614
	615	return 0;
	616	}
	617
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
/*
 * Store hook for the per-node "compact" attribute: any write compacts the
 * node identified by dev->id. The written data is ignored and the whole
 * write is reported as consumed.
 */
ssize_t sysfs_compact_node(struct sys_device *dev,
			struct sysdev_attribute *attr,
			const char *buf, size_t count)
{
	compact_node(dev->id);

	return count;
}
/* Owner-writable attribute with no show method */
static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);

/* Add the "compact" attribute to a node; returns sysdev_create_file()'s result */
int compaction_register_node(struct node *node)
{
	return sysdev_create_file(&node->sysdev, &attr_compact);
}

/* Remove the "compact" attribute from a node */
void compaction_unregister_node(struct node *node)
{
	/* No 'return <expr>;' here: the function returns void */
	sysdev_remove_file(&node->sysdev, &attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */