From d354e29e89d5d62754e60f730a67a55c0341f451 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fabian=20Gr=C3=BCnbichler?= Date: Mon, 9 Jan 2017 10:05:16 +0100 Subject: [PATCH] revert buggy OOM commits --- ...le-on-IO-only-when-there-are-too-man.patch | 118 ++++++++ 0002-Revert-mm-oom-rework-oom-detection.patch | 255 ++++++++++++++++++ Makefile | 2 + 3 files changed, 375 insertions(+) create mode 100644 0001-Revert-mm-throttle-on-IO-only-when-there-are-too-man.patch create mode 100644 0002-Revert-mm-oom-rework-oom-detection.patch diff --git a/0001-Revert-mm-throttle-on-IO-only-when-there-are-too-man.patch b/0001-Revert-mm-throttle-on-IO-only-when-there-are-too-man.patch new file mode 100644 index 0000000..b4ff5a8 --- /dev/null +++ b/0001-Revert-mm-throttle-on-IO-only-when-there-are-too-man.patch @@ -0,0 +1,118 @@ +From 3168fc7faf603da9d523c9dffbec6fee5b1a8a04 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= +Date: Wed, 4 Jan 2017 11:29:00 +0100 +Subject: [PATCH 1/2] Revert "mm: throttle on IO only when there are too many + dirty and writeback pages" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This reverts commit 57e9ef475661f46769cad6c0ed9a13f0cec1dbd8. + +Signed-off-by: Fabian Grünbichler +--- + mm/backing-dev.c | 20 +++++++++++++++++--- + mm/page_alloc.c | 41 ++++------------------------------------- + 2 files changed, 21 insertions(+), 40 deletions(-) + +diff --git a/mm/backing-dev.c b/mm/backing-dev.c +index a1aef87..9ef80bf 100644 +--- a/mm/backing-dev.c ++++ b/mm/backing-dev.c +@@ -976,8 +976,9 @@ EXPORT_SYMBOL(congestion_wait); + * jiffies for either a BDI to exit congestion of the given @sync queue + * or a write to complete. + * +- * In the absence of zone congestion, cond_resched() is called to yield +- * the processor if necessary but otherwise does not sleep. ++ * In the absence of zone congestion, a short sleep or a cond_resched is ++ * performed to yield the processor and to allow other subsystems to make ++ * a forward progress. + * + * The return value is 0 if the sleep is for the full timeout. Otherwise, + * it is the number of jiffies that were still remaining when the function +@@ -997,7 +998,20 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) + */ + if (atomic_read(&nr_wb_congested[sync]) == 0 || + !test_bit(ZONE_CONGESTED, &zone->flags)) { +- cond_resched(); ++ ++ /* ++ * Memory allocation/reclaim might be called from a WQ ++ * context and the current implementation of the WQ ++ * concurrency control doesn't recognize that a particular ++ * WQ is congested if the worker thread is looping without ++ * ever sleeping. Therefore we have to do a short sleep ++ * here rather than calling cond_resched(). 
++ */ ++ if (current->flags & PF_WQ_WORKER) ++ schedule_timeout_uninterruptible(1); ++ else ++ cond_resched(); ++ + /* In case we scheduled, work out time remaining */ + ret = timeout - (jiffies - start); + if (ret < 0) +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index aadbd7e..f13b503 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3038,9 +3038,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; +- unsigned long reclaimable; + +- available = reclaimable = zone_reclaimable_pages(zone); ++ available = zone_reclaimable_pages(zone); + available -= DIV_ROUND_UP(no_progress_loops * available, + MAX_RECLAIM_RETRIES); + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); +@@ -3050,41 +3049,9 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, + * available? + */ + if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), +- ac->classzone_idx, alloc_flags, available)) { +- /* +- * If we didn't make any progress and have a lot of +- * dirty + writeback pages then we should wait for +- * an IO to complete to slow down the reclaim and +- * prevent from pre mature OOM +- */ +- if (!did_some_progress) { +- unsigned long writeback; +- unsigned long dirty; +- +- writeback = zone_page_state_snapshot(zone, +- NR_WRITEBACK); +- dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY); +- +- if (2*(writeback + dirty) > reclaimable) { +- congestion_wait(BLK_RW_ASYNC, HZ/10); +- return true; +- } +- } +- +- /* +- * Memory allocation/reclaim might be called from a WQ +- * context and the current implementation of the WQ +- * concurrency control doesn't recognize that +- * a particular WQ is congested if the worker thread is +- * looping without ever sleeping. Therefore we have to +- * do a short sleep here rather than calling +- * cond_resched(). +- */ +- if (current->flags & PF_WQ_WORKER) +- schedule_timeout_uninterruptible(1); +- else +- cond_resched(); +- ++ ac->high_zoneidx, alloc_flags, available)) { ++ /* Wait for some write requests to complete then retry */ ++ wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50); + return true; + } + } +-- +2.1.4 + diff --git a/0002-Revert-mm-oom-rework-oom-detection.patch b/0002-Revert-mm-oom-rework-oom-detection.patch new file mode 100644 index 0000000..5a1ec76 --- /dev/null +++ b/0002-Revert-mm-oom-rework-oom-detection.patch @@ -0,0 +1,255 @@ +From 6e2588df3dc3d1704eae939ed9c9425000f48069 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= +Date: Wed, 4 Jan 2017 11:29:26 +0100 +Subject: [PATCH 2/2] Revert "mm, oom: rework oom detection" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This reverts commit c630ec12d831521b0566481eb56d7257b051911e. 
+ +Signed-off-by: Fabian Grünbichler +--- + include/linux/swap.h | 1 - + mm/page_alloc.c | 100 +++++---------------------------------------------- + mm/vmscan.c | 25 ++++++++++--- + 3 files changed, 29 insertions(+), 97 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 1498c5a..d8ca2ea 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -318,7 +318,6 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma); + + /* linux/mm/vmscan.c */ +-extern unsigned long zone_reclaimable_pages(struct zone *zone); + extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, + gfp_t gfp_mask, nodemask_t *mask); + extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index f13b503..56319cf 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -2988,77 +2988,6 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask) + return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; + } + +-/* +- * Maximum number of reclaim retries without any progress before OOM killer +- * is consider as the only way to move forward. +- */ +-#define MAX_RECLAIM_RETRIES 16 +- +-/* +- * Checks whether it makes sense to retry the reclaim to make a forward progress +- * for the given allocation request. +- * The reclaim feedback represented by did_some_progress (any progress during +- * the last reclaim round), pages_reclaimed (cumulative number of reclaimed +- * pages) and no_progress_loops (number of reclaim rounds without any progress +- * in a row) is considered as well as the reclaimable pages on the applicable +- * zone list (with a backoff mechanism which is a function of no_progress_loops). +- * +- * Returns true if a retry is viable or false to enter the oom path. +- */ +-static inline bool +-should_reclaim_retry(gfp_t gfp_mask, unsigned order, +- struct alloc_context *ac, int alloc_flags, +- bool did_some_progress, unsigned long pages_reclaimed, +- int no_progress_loops) +-{ +- struct zone *zone; +- struct zoneref *z; +- +- /* +- * Make sure we converge to OOM if we cannot make any progress +- * several times in the row. +- */ +- if (no_progress_loops > MAX_RECLAIM_RETRIES) +- return false; +- +- if (order > PAGE_ALLOC_COSTLY_ORDER) { +- if (pages_reclaimed >= (1<zonelist, ac->high_zoneidx, +- ac->nodemask) { +- unsigned long available; +- +- available = zone_reclaimable_pages(zone); +- available -= DIV_ROUND_UP(no_progress_loops * available, +- MAX_RECLAIM_RETRIES); +- available += zone_page_state_snapshot(zone, NR_FREE_PAGES); +- +- /* +- * Would the allocation succeed if we reclaimed the whole +- * available? 
+- */ +- if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), +- ac->high_zoneidx, alloc_flags, available)) { +- /* Wait for some write requests to complete then retry */ +- wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50); +- return true; +- } +- } +- +- return false; +-} +- + static inline struct page * + __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + struct alloc_context *ac) +@@ -3071,7 +3000,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + enum migrate_mode migration_mode = MIGRATE_ASYNC; + bool deferred_compaction = false; + int contended_compaction = COMPACT_CONTENDED_NONE; +- int no_progress_loops = 0; + + /* + * In the slowpath, we sanity check order to avoid ever trying to +@@ -3223,24 +3151,14 @@ retry: + if (gfp_mask & __GFP_NORETRY) + goto noretry; + +- /* +- * Do not retry costly high order allocations unless they are +- * __GFP_REPEAT +- */ +- if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) +- goto noretry; +- +- if (did_some_progress) { +- no_progress_loops = 0; +- pages_reclaimed += did_some_progress; +- } else { +- no_progress_loops++; +- } +- +- if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, +- did_some_progress > 0, pages_reclaimed, +- no_progress_loops)) ++ /* Keep reclaiming pages as long as there is reasonable progress */ ++ pages_reclaimed += did_some_progress; ++ if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || ++ ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { ++ /* Wait for some write requests to complete then retry */ ++ wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); + goto retry; ++ } + + /* Reclaim has failed us, start killing things */ + page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); +@@ -3248,10 +3166,8 @@ retry: + goto got_pg; + + /* Retry as long as the OOM killer is making progress */ +- if (did_some_progress) { +- no_progress_loops = 0; ++ if (did_some_progress) + goto retry; +- } + + noretry: + /* +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 56f902d..3597160 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -192,7 +192,7 @@ static bool sane_reclaim(struct scan_control *sc) + } + #endif + +-unsigned long zone_reclaimable_pages(struct zone *zone) ++static unsigned long zone_reclaimable_pages(struct zone *zone) + { + unsigned long nr; + +@@ -2492,8 +2492,10 @@ static inline bool compaction_ready(struct zone *zone, int order) + * + * If a zone is deemed to be full of pinned pages then just give it a light + * scan then give up on it. ++ * ++ * Returns true if a zone was reclaimable. 
+ */ +-static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) ++static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) + { + struct zoneref *z; + struct zone *zone; +@@ -2501,6 +2503,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) + unsigned long nr_soft_scanned; + gfp_t orig_mask; + enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); ++ bool reclaimable = false; + + /* + * If the number of buffer_heads in the machine exceeds the maximum +@@ -2565,10 +2568,17 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + sc->nr_scanned += nr_soft_scanned; ++ if (nr_soft_reclaimed) ++ reclaimable = true; + /* need some check for avoid more shrink_zone() */ + } + +- shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); ++ if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) ++ reclaimable = true; ++ ++ if (global_reclaim(sc) && ++ !reclaimable && zone_reclaimable(zone)) ++ reclaimable = true; + } + + /* +@@ -2576,6 +2586,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) + * promoted it to __GFP_HIGHMEM. + */ + sc->gfp_mask = orig_mask; ++ ++ return reclaimable; + } + + /* +@@ -2600,6 +2612,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + int initial_priority = sc->priority; + unsigned long total_scanned = 0; + unsigned long writeback_threshold; ++ bool zones_reclaimable; + retry: + delayacct_freepages_start(); + +@@ -2610,7 +2623,7 @@ retry: + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); + sc->nr_scanned = 0; +- shrink_zones(zonelist, sc); ++ zones_reclaimable = shrink_zones(zonelist, sc); + + total_scanned += sc->nr_scanned; + if (sc->nr_reclaimed >= sc->nr_to_reclaim) +@@ -2657,6 +2670,10 @@ retry: + goto retry; + } + ++ /* Any of the zones still reclaimable? Don't OOM. */ ++ if (zones_reclaimable) ++ return 1; ++ + return 0; + } + +-- +2.1.4 + diff --git a/Makefile b/Makefile index 380712a..a50b237 100644 --- a/Makefile +++ b/Makefile @@ -268,6 +268,8 @@ ${KERNEL_SRC}/README ${KERNEL_CFG_ORG}: ${KERNELSRCTAR} # IPoIB performance regression fix cd ${KERNEL_SRC}; patch -p1 < ../IB-ipoib-move-back-the-IB-LL-address-into-the-hard-header.patch cd ${KERNEL_SRC}; patch -p1 < ../cgroup-cpuset-add-cpuset.remap_cpus.patch + cd ${KERNEL_SRC}; patch -p1 < ../0001-Revert-mm-throttle-on-IO-only-when-there-are-too-man.patch + cd ${KERNEL_SRC}; patch -p1 < ../0002-Revert-mm-oom-rework-oom-detection.patch sed -i ${KERNEL_SRC}/Makefile -e 's/^EXTRAVERSION.*$$/EXTRAVERSION=${EXTRAVERSION}/' touch $@ -- 2.39.2
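For reference, the heuristic these two reverts remove is visible in the deleted hunks of 0002-Revert-mm-oom-rework-oom-detection.patch: should_reclaim_retry() discounted the per-zone reclaimable-page estimate by no_progress_loops/MAX_RECLAIM_RETRIES before re-checking the min watermark, so after 16 fruitless reclaim rounds only the free pages counted and the OOM killer was invoked. The standalone user-space sketch below is not part of this patch or of the kernel tree; it only mirrors that arithmetic with made-up example zone numbers to show how quickly the estimate decays to the free-page count, which is the "premature OOM" detection path being reverted in favour of the old "keep retrying while there is reasonable progress" loop.

/*
 * Standalone illustration (plain user-space C, hypothetical numbers) of the
 * backoff used by the now-reverted should_reclaim_retry().
 */
#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16		/* same constant as the reverted code */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/*
 * Mirrors the reverted estimate: the reclaimable-page count is scaled down
 * linearly with the number of no-progress reclaim rounds, then the free
 * pages are added back before the watermark check.
 */
static unsigned long estimated_available(unsigned long reclaimable,
					 unsigned long free_pages,
					 int no_progress_loops)
{
	unsigned long available = reclaimable;

	available -= DIV_ROUND_UP((unsigned long)no_progress_loops * available,
				  MAX_RECLAIM_RETRIES);
	available += free_pages;
	return available;
}

int main(void)
{
	/* hypothetical zone state: 40960 reclaimable pages, 512 free pages */
	const unsigned long reclaimable = 40960, free_pages = 512;
	int loops;

	for (loops = 0; loops <= MAX_RECLAIM_RETRIES; loops++)
		printf("no_progress_loops=%2d -> available estimate %lu pages\n",
		       loops, estimated_available(reclaimable, free_pages, loops));
	return 0;
}

With the reverts applied, the allocator slow path instead falls back to the pre-rework behaviour shown in the hunks above: retry while did_some_progress is non-zero (or while __GFP_REPEAT allocations have reclaimed fewer than 1 << order pages), throttling via wait_iff_congested() rather than the dirty/writeback-based congestion_wait() check.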