revert buggy OOM commits

author Fabian Grünbichler <f.gruenbichler@proxmox.com>

Mon, 9 Jan 2017 09:05:16 +0000 (10:05 +0100)

committer Fabian Grünbichler <f.gruenbichler@proxmox.com>

Mon, 9 Jan 2017 09:05:16 +0000 (10:05 +0100)
author Fabian Grünbichler <f.gruenbichler@proxmox.com>
Mon, 9 Jan 2017 09:05:16 +0000 (10:05 +0100)
committer Fabian Grünbichler <f.gruenbichler@proxmox.com>
Mon, 9 Jan 2017 09:05:16 +0000 (10:05 +0100)
diff --git a/0001-Revert-mm-throttle-on-IO-only-when-there-are-too-man.patch b/0001-Revert-mm-throttle-on-IO-only-when-there-are-too-man.patch

new file mode 100644 (file)

index 0000000..b4ff5a8
--- /dev/null
+++ b/0001-Revert-mm-throttle-on-IO-only-when-there-are-too-man.patch
@@ -0,0 +1,118 @@
+From 3168fc7faf603da9d523c9dffbec6fee5b1a8a04 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
+Date: Wed, 4 Jan 2017 11:29:00 +0100
+Subject: [PATCH 1/2] Revert "mm: throttle on IO only when there are too many
+ dirty and writeback pages"
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This reverts commit 57e9ef475661f46769cad6c0ed9a13f0cec1dbd8.
+
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ mm/backing-dev.c | 20 +++++++++++++++++---
+ mm/page_alloc.c  | 41 ++++-------------------------------------
+ 2 files changed, 21 insertions(+), 40 deletions(-)
+
+diff --git a/mm/backing-dev.c b/mm/backing-dev.c
+index a1aef87..9ef80bf 100644
+--- a/mm/backing-dev.c
++++ b/mm/backing-dev.c
+@@ -976,8 +976,9 @@ EXPORT_SYMBOL(congestion_wait);
+  * jiffies for either a BDI to exit congestion of the given @sync queue
+  * or a write to complete.
+  *
+- * In the absence of zone congestion, cond_resched() is called to yield
+- * the processor if necessary but otherwise does not sleep.
++ * In the absence of zone congestion, a short sleep or a cond_resched is
++ * performed to yield the processor and to allow other subsystems to make
++ * a forward progress.
+  *
+  * The return value is 0 if the sleep is for the full timeout. Otherwise,
+  * it is the number of jiffies that were still remaining when the function
+@@ -997,7 +998,20 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
+        */
+       if (atomic_read(&nr_wb_congested[sync]) == 0 ||
+           !test_bit(ZONE_CONGESTED, &zone->flags)) {
+-              cond_resched();
++
++              /*
++               * Memory allocation/reclaim might be called from a WQ
++               * context and the current implementation of the WQ
++               * concurrency control doesn't recognize that a particular
++               * WQ is congested if the worker thread is looping without
++               * ever sleeping. Therefore we have to do a short sleep
++               * here rather than calling cond_resched().
++               */
++              if (current->flags & PF_WQ_WORKER)
++                      schedule_timeout_uninterruptible(1);
++              else
++                      cond_resched();
++
+               /* In case we scheduled, work out time remaining */
+               ret = timeout - (jiffies - start);
+               if (ret < 0)
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index aadbd7e..f13b503 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3038,9 +3038,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
+       for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+                                       ac->nodemask) {
+               unsigned long available;
+-              unsigned long reclaimable;
+ 
+-              available = reclaimable = zone_reclaimable_pages(zone);
++              available = zone_reclaimable_pages(zone);
+               available -= DIV_ROUND_UP(no_progress_loops * available,
+                                         MAX_RECLAIM_RETRIES);
+               available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+@@ -3050,41 +3049,9 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
+                * available?
+                */
+               if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
+-                              ac->classzone_idx, alloc_flags, available)) {
+-                      /*
+-                       * If we didn't make any progress and have a lot of
+-                       * dirty + writeback pages then we should wait for
+-                       * an IO to complete to slow down the reclaim and
+-                       * prevent from pre mature OOM
+-                       */
+-                      if (!did_some_progress) {
+-                              unsigned long writeback;
+-                              unsigned long dirty;
+-
+-                              writeback = zone_page_state_snapshot(zone,
+-                                                                   NR_WRITEBACK);
+-                              dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+-
+-                              if (2*(writeback + dirty) > reclaimable) {
+-                                      congestion_wait(BLK_RW_ASYNC, HZ/10);
+-                                      return true;
+-                              }
+-                      }
+-
+-                      /*
+-                       * Memory allocation/reclaim might be called from a WQ
+-                       * context and the current implementation of the WQ
+-                       * concurrency control doesn't recognize that
+-                       * a particular WQ is congested if the worker thread is
+-                       * looping without ever sleeping. Therefore we have to
+-                       * do a short sleep here rather than calling
+-                       * cond_resched().
+-                       */
+-                      if (current->flags & PF_WQ_WORKER)
+-                              schedule_timeout_uninterruptible(1);
+-                      else
+-                              cond_resched();
+-
++                              ac->high_zoneidx, alloc_flags, available)) {
++                      /* Wait for some write requests to complete then retry */
++                      wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50);
+                       return true;
+               }
+       }
+-- 
+2.1.4
+
diff --git a/0002-Revert-mm-oom-rework-oom-detection.patch b/0002-Revert-mm-oom-rework-oom-detection.patch

new file mode 100644 (file)

index 0000000..5a1ec76
--- /dev/null
+++ b/0002-Revert-mm-oom-rework-oom-detection.patch
@@ -0,0 +1,255 @@
+From 6e2588df3dc3d1704eae939ed9c9425000f48069 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
+Date: Wed, 4 Jan 2017 11:29:26 +0100
+Subject: [PATCH 2/2] Revert "mm, oom: rework oom detection"
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This reverts commit c630ec12d831521b0566481eb56d7257b051911e.
+
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ include/linux/swap.h |   1 -
+ mm/page_alloc.c      | 100 +++++----------------------------------------------
+ mm/vmscan.c          |  25 ++++++++++---
+ 3 files changed, 29 insertions(+), 97 deletions(-)
+
+diff --git a/include/linux/swap.h b/include/linux/swap.h
+index 1498c5a..d8ca2ea 100644
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -318,7 +318,6 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
+                                               struct vm_area_struct *vma);
+ 
+ /* linux/mm/vmscan.c */
+-extern unsigned long zone_reclaimable_pages(struct zone *zone);
+ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
+                                       gfp_t gfp_mask, nodemask_t *mask);
+ extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index f13b503..56319cf 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -2988,77 +2988,6 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
+       return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+ }
+ 
+-/*
+- * Maximum number of reclaim retries without any progress before OOM killer
+- * is consider as the only way to move forward.
+- */
+-#define MAX_RECLAIM_RETRIES 16
+-
+-/*
+- * Checks whether it makes sense to retry the reclaim to make a forward progress
+- * for the given allocation request.
+- * The reclaim feedback represented by did_some_progress (any progress during
+- * the last reclaim round), pages_reclaimed (cumulative number of reclaimed
+- * pages) and no_progress_loops (number of reclaim rounds without any progress
+- * in a row) is considered as well as the reclaimable pages on the applicable
+- * zone list (with a backoff mechanism which is a function of no_progress_loops).
+- *
+- * Returns true if a retry is viable or false to enter the oom path.
+- */
+-static inline bool
+-should_reclaim_retry(gfp_t gfp_mask, unsigned order,
+-                   struct alloc_context *ac, int alloc_flags,
+-                   bool did_some_progress, unsigned long pages_reclaimed,
+-                   int no_progress_loops)
+-{
+-      struct zone *zone;
+-      struct zoneref *z;
+-
+-      /*
+-       * Make sure we converge to OOM if we cannot make any progress
+-       * several times in the row.
+-       */
+-      if (no_progress_loops > MAX_RECLAIM_RETRIES)
+-              return false;
+-
+-      if (order > PAGE_ALLOC_COSTLY_ORDER) {
+-              if (pages_reclaimed >= (1<<order))
+-                      return false;
+-
+-              if (did_some_progress)
+-                      return true;
+-      }
+-
+-      /*
+-       * Keep reclaiming pages while there is a chance this will lead somewhere.
+-       * If none of the target zones can satisfy our allocation request even
+-       * if all reclaimable pages are considered then we are screwed and have
+-       * to go OOM.
+-       */
+-      for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+-                                      ac->nodemask) {
+-              unsigned long available;
+-
+-              available = zone_reclaimable_pages(zone);
+-              available -= DIV_ROUND_UP(no_progress_loops * available,
+-                                        MAX_RECLAIM_RETRIES);
+-              available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+-
+-              /*
+-               * Would the allocation succeed if we reclaimed the whole
+-               * available?
+-               */
+-              if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
+-                              ac->high_zoneidx, alloc_flags, available)) {
+-                      /* Wait for some write requests to complete then retry */
+-                      wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50);
+-                      return true;
+-              }
+-      }
+-
+-      return false;
+-}
+-
+ static inline struct page *
+ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+                                               struct alloc_context *ac)
+@@ -3071,7 +3000,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+       enum migrate_mode migration_mode = MIGRATE_ASYNC;
+       bool deferred_compaction = false;
+       int contended_compaction = COMPACT_CONTENDED_NONE;
+-      int no_progress_loops = 0;
+ 
+       /*
+        * In the slowpath, we sanity check order to avoid ever trying to
+@@ -3223,24 +3151,14 @@ retry:
+       if (gfp_mask & __GFP_NORETRY)
+               goto noretry;
+ 
+-      /*
+-       * Do not retry costly high order allocations unless they are
+-       * __GFP_REPEAT
+-       */
+-      if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+-              goto noretry;
+-
+-      if (did_some_progress) {
+-              no_progress_loops = 0;
+-              pages_reclaimed += did_some_progress;
+-      } else {
+-              no_progress_loops++;
+-      }
+-
+-      if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
+-                               did_some_progress > 0, pages_reclaimed,
+-                               no_progress_loops))
++      /* Keep reclaiming pages as long as there is reasonable progress */
++      pages_reclaimed += did_some_progress;
++      if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
++          ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
++              /* Wait for some write requests to complete then retry */
++              wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
+               goto retry;
++      }
+ 
+       /* Reclaim has failed us, start killing things */
+       page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+@@ -3248,10 +3166,8 @@ retry:
+               goto got_pg;
+ 
+       /* Retry as long as the OOM killer is making progress */
+-      if (did_some_progress) {
+-              no_progress_loops = 0;
++      if (did_some_progress)
+               goto retry;
+-      }
+ 
+ noretry:
+       /*
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 56f902d..3597160 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -192,7 +192,7 @@ static bool sane_reclaim(struct scan_control *sc)
+ }
+ #endif
+ 
+-unsigned long zone_reclaimable_pages(struct zone *zone)
++static unsigned long zone_reclaimable_pages(struct zone *zone)
+ {
+       unsigned long nr;
+ 
+@@ -2492,8 +2492,10 @@ static inline bool compaction_ready(struct zone *zone, int order)
+  *
+  * If a zone is deemed to be full of pinned pages then just give it a light
+  * scan then give up on it.
++ *
++ * Returns true if a zone was reclaimable.
+  */
+-static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
++static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+ {
+       struct zoneref *z;
+       struct zone *zone;
+@@ -2501,6 +2503,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+       unsigned long nr_soft_scanned;
+       gfp_t orig_mask;
+       enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
++      bool reclaimable = false;
+ 
+       /*
+        * If the number of buffer_heads in the machine exceeds the maximum
+@@ -2565,10 +2568,17 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+                                               &nr_soft_scanned);
+                       sc->nr_reclaimed += nr_soft_reclaimed;
+                       sc->nr_scanned += nr_soft_scanned;
++                      if (nr_soft_reclaimed)
++                              reclaimable = true;
+                       /* need some check for avoid more shrink_zone() */
+               }
+ 
+-              shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
++              if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
++                      reclaimable = true;
++
++              if (global_reclaim(sc) &&
++                  !reclaimable && zone_reclaimable(zone))
++                      reclaimable = true;
+       }
+ 
+       /*
+@@ -2576,6 +2586,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+        * promoted it to __GFP_HIGHMEM.
+        */
+       sc->gfp_mask = orig_mask;
++
++      return reclaimable;
+ }
+ 
+ /*
+@@ -2600,6 +2612,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
+       int initial_priority = sc->priority;
+       unsigned long total_scanned = 0;
+       unsigned long writeback_threshold;
++      bool zones_reclaimable;
+ retry:
+       delayacct_freepages_start();
+ 
+@@ -2610,7 +2623,7 @@ retry:
+               vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+                               sc->priority);
+               sc->nr_scanned = 0;
+-              shrink_zones(zonelist, sc);
++              zones_reclaimable = shrink_zones(zonelist, sc);
+ 
+               total_scanned += sc->nr_scanned;
+               if (sc->nr_reclaimed >= sc->nr_to_reclaim)
+@@ -2657,6 +2670,10 @@ retry:
+               goto retry;
+       }
+ 
++      /* Any of the zones still reclaimable?  Don't OOM. */
++      if (zones_reclaimable)
++              return 1;
++
+       return 0;
+ }
+ 
+-- 
+2.1.4
+
diff --git a/Makefile b/Makefile

index 380712a86dd4a8b8fa1c0b759e14c45813a9c5f1..a50b2379010eb51cf09c6a98415a398f25792178 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -268,6 +268,8 @@ ${KERNEL_SRC}/README ${KERNEL_CFG_ORG}: ${KERNELSRCTAR}
         # IPoIB performance regression fix
         cd ${KERNEL_SRC}; patch -p1 < ../IB-ipoib-move-back-the-IB-LL-address-into-the-hard-header.patch
         cd ${KERNEL_SRC}; patch -p1 < ../cgroup-cpuset-add-cpuset.remap_cpus.patch
+       cd ${KERNEL_SRC}; patch -p1 < ../0001-Revert-mm-throttle-on-IO-only-when-there-are-too-man.patch
+       cd ${KERNEL_SRC}; patch -p1 < ../0002-Revert-mm-oom-rework-oom-detection.patch
         sed -i ${KERNEL_SRC}/Makefile -e 's/^EXTRAVERSION.*$$/EXTRAVERSION=${EXTRAVERSION}/'
         touch $@
author	Fabian Grünbichler <f.gruenbichler@proxmox.com>
	Mon, 9 Jan 2017 09:05:16 +0000 (10:05 +0100)
committer	Fabian Grünbichler <f.gruenbichler@proxmox.com>
	Mon, 9 Jan 2017 09:05:16 +0000 (10:05 +0100)
0001-Revert-mm-throttle-on-IO-only-when-there-are-too-man.patch	[new file with mode: 0644]	patch \| blob
0002-Revert-mm-oom-rework-oom-detection.patch	[new file with mode: 0644]	patch \| blob
Makefile		patch \| blob \| blame \| history