mm: page_alloc: spill to remote nodes before waking kswapd

author Johannes Weiner <hannes@cmpxchg.org>

Mon, 7 Apr 2014 22:37:48 +0000 (15:37 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Mon, 7 Apr 2014 23:35:57 +0000 (16:35 -0700)
author Johannes Weiner <hannes@cmpxchg.org>
Mon, 7 Apr 2014 22:37:48 +0000 (15:37 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 7 Apr 2014 23:35:57 +0000 (16:35 -0700)
diff --git a/mm/internal.h b/mm/internal.h

index 29e1e761f9ebe3fee42eea1b0e77589122a23d24..3e910000fda43a55a477ea8c3b5984e2e8008fb8 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -370,5 +370,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
  #define ALLOC_HIGH             0x20 /* __GFP_HIGH set */
  #define ALLOC_CPUSET           0x40 /* check for correct cpuset */
  #define ALLOC_CMA              0x80 /* allow allocations from CMA areas */
+#define ALLOC_FAIR             0x100 /* fair zone allocation */
  
  #endif /* __MM_INTERNAL_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 73c25912c7c45542fce16dae7529442adf49e39c..15d140755e71fd0b47fa305165c802f685eaeb92 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1239,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
         }
         local_irq_restore(flags);
  }
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-       return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
-}
-#else
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-       return false;
-}
  #endif
  
  /*
@@ -1584,12 +1575,7 @@ again:
                                           get_pageblock_migratetype(page));
         }
  
-       /*
-        * NOTE: GFP_THISNODE allocations do not partake in the kswapd
-        * aging protocol, so they can't be fair.
-        */
-       if (!gfp_thisnode_allocation(gfp_flags))
-               __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+       __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
  
         __count_zone_vm_events(PGALLOC, zone, 1 << order);
         zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1955,23 +1941,12 @@ zonelist_scan:
                  * zone size to ensure fair page aging.  The zone a
                  * page was allocated in should have no effect on the
                  * time the page has in memory before being reclaimed.
-                *
-                * Try to stay in local zones in the fastpath.  If
-                * that fails, the slowpath is entered, which will do
-                * another pass starting with the local zones, but
-                * ultimately fall back to remote zones that do not
-                * partake in the fairness round-robin cycle of this
-                * zonelist.
-                *
-                * NOTE: GFP_THISNODE allocations do not partake in
-                * the kswapd aging protocol, so they can't be fair.
                  */
-               if ((alloc_flags & ALLOC_WMARK_LOW) &&
-                   !gfp_thisnode_allocation(gfp_mask)) {
-                       if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-                               continue;
+               if (alloc_flags & ALLOC_FAIR) {
                         if (!zone_local(preferred_zone, zone))
                                 continue;
+                       if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+                               continue;
                 }
                 /*
                  * When allocating a page cache page for writing, we
@@ -2409,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
         return page;
  }
  
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
-                            struct zonelist *zonelist,
-                            enum zone_type high_zoneidx,
-                            struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+                               enum zone_type high_zoneidx,
+                               struct zone *preferred_zone)
  {
         struct zoneref *z;
         struct zone *zone;
  
         for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-               if (!(gfp_mask & __GFP_NO_KSWAPD))
-                       wakeup_kswapd(zone, order, zone_idx(preferred_zone));
                 /*
                  * Only reset the batches of zones that were actually
-                * considered in the fast path, we don't want to
-                * thrash fairness information for zones that are not
+                * considered in the fairness pass, we don't want to
+                * trash fairness information for zones that are not
                  * actually part of this zonelist's round-robin cycle.
                  */
                 if (!zone_local(preferred_zone, zone))
                         continue;
                 mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                                   high_wmark_pages(zone) -
-                                   low_wmark_pages(zone) -
-                                   zone_page_state(zone, NR_ALLOC_BATCH));
+                       high_wmark_pages(zone) - low_wmark_pages(zone) -
+                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
         }
  }
  
+static void wake_all_kswapds(unsigned int order,
+                            struct zonelist *zonelist,
+                            enum zone_type high_zoneidx,
+                            struct zone *preferred_zone)
+{
+       struct zoneref *z;
+       struct zone *zone;
+
+       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+               wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
  static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
@@ -2523,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
          * allowed per node queues are empty and that nodes are
          * over allocated.
          */
-       if (gfp_thisnode_allocation(gfp_mask))
+       if (IS_ENABLED(CONFIG_NUMA) &&
+           (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
                 goto nopage;
  
  restart:
-       prepare_slowpath(gfp_mask, order, zonelist,
-                        high_zoneidx, preferred_zone);
+       if (!(gfp_mask & __GFP_NO_KSWAPD))
+               wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
  
         /*
          * OK, we're below the kswapd watermark and have kicked background
@@ -2712,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
         struct page *page = NULL;
         int migratetype = allocflags_to_migratetype(gfp_mask);
         unsigned int cpuset_mems_cookie;
-       int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+       int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
         struct mem_cgroup *memcg = NULL;
  
         gfp_mask &= gfp_allowed_mask;
@@ -2753,11 +2737,28 @@ retry_cpuset:
         if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                 alloc_flags |= ALLOC_CMA;
  #endif
+retry:
         /* First allocation attempt */
         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                         zonelist, high_zoneidx, alloc_flags,
                         preferred_zone, migratetype);
         if (unlikely(!page)) {
+               /*
+                * The first pass makes sure allocations are spread
+                * fairly within the local node.  However, the local
+                * node might have free pages left after the fairness
+                * batches are exhausted, and remote zones haven't
+                * even been considered yet.  Try once more without
+                * fairness, and include remote zones now, before
+                * entering the slowpath and waking kswapd: prefer
+                * spilling to a remote zone over swapping locally.
+                */
+               if (alloc_flags & ALLOC_FAIR) {
+                       reset_alloc_batches(zonelist, high_zoneidx,
+                                           preferred_zone);
+                       alloc_flags &= ~ALLOC_FAIR;
+                       goto retry;
+               }
                 /*
                  * Runtime PM, block IO and its error handling path
                  * can deadlock because I/O on the device might not
author	Johannes Weiner <hannes@cmpxchg.org>
	Mon, 7 Apr 2014 22:37:48 +0000 (15:37 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 7 Apr 2014 23:35:57 +0000 (16:35 -0700)
mm/internal.h		patch \| blob \| blame \| history
mm/page_alloc.c		patch \| blob \| blame \| history