mm: compaction: capture a suitable high-order page immediately when it is made available

author Mel Gorman <mgorman@suse.de>

Mon, 8 Oct 2012 23:29:12 +0000 (16:29 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 9 Oct 2012 07:22:21 +0000 (16:22 +0900)
author Mel Gorman <mgorman@suse.de>
Mon, 8 Oct 2012 23:29:12 +0000 (16:29 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 9 Oct 2012 07:22:21 +0000 (16:22 +0900)
diff --git a/include/linux/compaction.h b/include/linux/compaction.h

index ef658147e4e8391eec61aa7559a1452fcffe6a79..0e38a1deeb2374f9553b088bd7c43b248da954d2 100644 (file)
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
  extern int fragmentation_index(struct zone *zone, unsigned int order);
  extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
                         int order, gfp_t gfp_mask, nodemask_t *mask,
-                       bool sync, bool *contended);
+                       bool sync, bool *contended, struct page **page);
  extern int compact_pgdat(pg_data_t *pgdat, int order);
  extern unsigned long compaction_suitable(struct zone *zone, int order);
  
@@ -64,7 +64,7 @@ static inline bool compaction_deferred(struct zone *zone, int order)
  #else
  static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
                         int order, gfp_t gfp_mask, nodemask_t *nodemask,
-                       bool sync, bool *contended)
+                       bool sync, bool *contended, struct page **page)
  {
         return COMPACT_CONTINUE;
  }
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 0514fe9d3c842f797a2ef52883cd74ec50bf7abf..5ddb11b2b4bb8d3611b0e4655a6369f4a8677923 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -442,6 +442,7 @@ void put_pages_list(struct list_head *pages);
  
  void split_page(struct page *page, unsigned int order);
  int split_free_page(struct page *page);
+int capture_free_page(struct page *page, int alloc_order, int migratetype);
  
  /*
   * Compound pages have a destructor function.  Provide a
diff --git a/mm/compaction.c b/mm/compaction.c

index 7168edc7592c687dc6a0de914739135360f512a0..0fbc6b73a522c1210cfc3b13b441e2762ec4e2e8 100644 (file)
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -91,6 +91,60 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
         return compact_checklock_irqsave(lock, flags, false, cc);
  }
  
+static void compact_capture_page(struct compact_control *cc)
+{
+       unsigned long flags;
+       int mtype, mtype_low, mtype_high;
+
+       if (!cc->page || *cc->page)
+               return;
+
+       /*
+        * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
+        * regardless of the migratetype of the freelist it is captured from.
+        * This is fine because the order for a high-order MIGRATE_MOVABLE
+        * allocation is typically at least a pageblock size and overall
+        * fragmentation is not impaired. Other allocation types must
+        * capture pages from their own migratelist because otherwise they
+        * could pollute other pageblocks like MIGRATE_MOVABLE with
+        * difficult-to-move pages, making fragmentation worse overall.
+        */
+       if (cc->migratetype == MIGRATE_MOVABLE) {
+               mtype_low = 0;
+               mtype_high = MIGRATE_PCPTYPES;
+       } else {
+               mtype_low = cc->migratetype;
+               mtype_high = cc->migratetype + 1;
+       }
+
+       /* Speculatively examine the free lists without zone lock */
+       for (mtype = mtype_low; mtype < mtype_high; mtype++) {
+               int order;
+               for (order = cc->order; order < MAX_ORDER; order++) {
+                       struct page *page;
+                       struct free_area *area;
+                       area = &(cc->zone->free_area[order]);
+                       if (list_empty(&area->free_list[mtype]))
+                               continue;
+
+                       /* Take the lock and attempt capture of the page */
+                       if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
+                               return;
+                       if (!list_empty(&area->free_list[mtype])) {
+                               page = list_entry(area->free_list[mtype].next,
+                                                       struct page, lru);
+                               if (capture_free_page(page, cc->order, mtype)) {
+                                       spin_unlock_irqrestore(&cc->zone->lock,
+                                                                       flags);
+                                       *cc->page = page;
+                                       return;
+                               }
+                       }
+                       spin_unlock_irqrestore(&cc->zone->lock, flags);
+               }
+       }
+}
+
  /*
   * Isolate free pages onto a private freelist. Caller must hold zone->lock.
   * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -645,7 +699,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
  static int compact_finished(struct zone *zone,
                             struct compact_control *cc)
  {
-       unsigned int order;
         unsigned long watermark;
  
         if (fatal_signal_pending(current))
@@ -688,14 +741,22 @@ static int compact_finished(struct zone *zone,
                 return COMPACT_CONTINUE;
  
         /* Direct compactor: Is a suitable page free? */
-       for (order = cc->order; order < MAX_ORDER; order++) {
-               /* Job done if page is free of the right migratetype */
-               if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
-                       return COMPACT_PARTIAL;
-
-               /* Job done if allocation would set block type */
-               if (order >= pageblock_order && zone->free_area[order].nr_free)
+       if (cc->page) {
+               /* Was a suitable page captured? */
+               if (*cc->page)
                         return COMPACT_PARTIAL;
+       } else {
+               unsigned int order;
+               for (order = cc->order; order < MAX_ORDER; order++) {
+                       struct free_area *area = &zone->free_area[cc->order];
+                       /* Job done if page is free of the right migratetype */
+                       if (!list_empty(&area->free_list[cc->migratetype]))
+                               return COMPACT_PARTIAL;
+
+                       /* Job done if allocation would set block type */
+                       if (cc->order >= pageblock_order && area->nr_free)
+                               return COMPACT_PARTIAL;
+               }
         }
  
         return COMPACT_CONTINUE;
@@ -817,6 +878,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                                 goto out;
                         }
                 }
+
+               /* Capture a page now if it is a suitable size */
+               compact_capture_page(cc);
         }
  
  out:
@@ -829,7 +893,8 @@ out:
  
  static unsigned long compact_zone_order(struct zone *zone,
                                  int order, gfp_t gfp_mask,
-                                bool sync, bool *contended)
+                                bool sync, bool *contended,
+                                struct page **page)
  {
         struct compact_control cc = {
                 .nr_freepages = 0,
@@ -839,6 +904,7 @@ static unsigned long compact_zone_order(struct zone *zone,
                 .zone = zone,
                 .sync = sync,
                 .contended = contended,
+               .page = page,
         };
         INIT_LIST_HEAD(&cc.freepages);
         INIT_LIST_HEAD(&cc.migratepages);
@@ -860,7 +926,7 @@ int sysctl_extfrag_threshold = 500;
   */
  unsigned long try_to_compact_pages(struct zonelist *zonelist,
                         int order, gfp_t gfp_mask, nodemask_t *nodemask,
-                       bool sync, bool *contended)
+                       bool sync, bool *contended, struct page **page)
  {
         enum zone_type high_zoneidx = gfp_zone(gfp_mask);
         int may_enter_fs = gfp_mask & __GFP_FS;
@@ -881,7 +947,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
                 int status;
  
                 status = compact_zone_order(zone, order, gfp_mask, sync,
-                                               contended);
+                                               contended, page);
                 rc = max(status, rc);
  
                 /* If a normal allocation would succeed, stop compacting */
@@ -936,6 +1002,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
         struct compact_control cc = {
                 .order = order,
                 .sync = false,
+               .page = NULL,
         };
  
         return __compact_pgdat(pgdat, &cc);
@@ -946,6 +1013,7 @@ static int compact_node(int nid)
         struct compact_control cc = {
                 .order = -1,
                 .sync = true,
+               .page = NULL,
         };
  
         return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/internal.h b/mm/internal.h

index b8c91b342e244153ec9b24b4673e6dd2575af267..e549a7fbc29678d8dd5295876d48b74e5a8fed34 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -131,6 +131,7 @@ struct compact_control {
         int migratetype;                /* MOVABLE, RECLAIMABLE etc */
         struct zone *zone;
         bool *contended;                /* True if a lock was contended */
+       struct page **page;             /* Page captured of requested size */
  };
  
  unsigned long
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 5e92698e539553903d69f804300f8ef9dfef796a..cfd565dbe124d41298fc112a21048fc8ec82b084 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1380,16 +1380,11 @@ void split_page(struct page *page, unsigned int order)
  }
  
  /*
- * Similar to split_page except the page is already free. As this is only
- * being used for migration, the migratetype of the block also changes.
- * As this is called with interrupts disabled, the caller is responsible
- * for calling arch_alloc_page() and kernel_map_page() after interrupts
- * are enabled.
- *
- * Note: this is probably too low level an operation for use in drivers.
- * Please consult with lkml before using this in your driver.
+ * Similar to the split_page family of functions except that the page
+ * is required at the given order and is being isolated now to prevent
+ * races with parallel allocators.
   */
-int split_free_page(struct page *page)
+int capture_free_page(struct page *page, int alloc_order, int migratetype)
  {
         unsigned int order;
         unsigned long watermark;
@@ -1411,10 +1406,11 @@ int split_free_page(struct page *page)
         rmv_page_order(page);
         __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
  
-       /* Split into individual pages */
-       set_page_refcounted(page);
-       split_page(page, order);
+       if (alloc_order != order)
+               expand(zone, page, alloc_order, order,
+                       &zone->free_area[order], migratetype);
  
+       /* Set the pageblock if the captured page is at least a pageblock */
         if (order >= pageblock_order - 1) {
                 struct page *endpage = page + (1 << order) - 1;
                 for (; page < endpage; page += pageblock_nr_pages) {
@@ -1425,7 +1421,35 @@ int split_free_page(struct page *page)
                 }
         }
  
-       return 1 << order;
+       return 1UL << order;
+}
+
+/*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_page() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+       unsigned int order;
+       int nr_pages;
+
+       BUG_ON(!PageBuddy(page));
+       order = page_order(page);
+
+       nr_pages = capture_free_page(page, order, 0);
+       if (!nr_pages)
+               return 0;
+
+       /* Split into individual pages */
+       set_page_refcounted(page);
+       split_page(page, order);
+       return nr_pages;
  }
  
  /*
@@ -2105,7 +2129,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         bool *contended_compaction, bool *deferred_compaction,
         unsigned long *did_some_progress)
  {
-       struct page *page;
+       struct page *page = NULL;
  
         if (!order)
                 return NULL;
@@ -2118,10 +2142,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         current->flags |= PF_MEMALLOC;
         *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                                                 nodemask, sync_migration,
-                                               contended_compaction);
+                                               contended_compaction, &page);
         current->flags &= ~PF_MEMALLOC;
-       if (*did_some_progress != COMPACT_SKIPPED) {
  
+       /* If compaction captured a page, prep and use it */
+       if (page) {
+               prep_new_page(page, order, gfp_mask);
+               goto got_page;
+       }
+
+       if (*did_some_progress != COMPACT_SKIPPED) {
                 /* Page migration frees to the PCP lists but we want merging */
                 drain_pages(get_cpu());
                 put_cpu();
@@ -2131,6 +2161,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                                 alloc_flags & ~ALLOC_NO_WATERMARKS,
                                 preferred_zone, migratetype);
                 if (page) {
+got_page:
                         preferred_zone->compact_considered = 0;
                         preferred_zone->compact_defer_shift = 0;
                         if (order >= preferred_zone->compact_order_failed)
author	Mel Gorman <mgorman@suse.de>
	Mon, 8 Oct 2012 23:29:12 +0000 (16:29 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 9 Oct 2012 07:22:21 +0000 (16:22 +0900)
include/linux/compaction.h		patch \| blob \| blame \| history
include/linux/mm.h		patch \| blob \| blame \| history
mm/compaction.c		patch \| blob \| blame \| history
mm/internal.h		patch \| blob \| blame \| history
mm/page_alloc.c		patch \| blob \| blame \| history