mm/vmscan.c: avoid throttling reclaim for loop-back nfsd threads
index 32c661d66a45498e270ba5e9019cda60a114cc27..5a8776eb0f4352abf7a1788f68e84db635f559b4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1438,6 +1438,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
        list_splice(&pages_to_free, page_list);
 }
 
+/*
+ * If a kernel thread (such as nfsd for loop-back mounts) services
+ * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
+ * In that case we should only throttle if the backing device it is
+ * writing to is congested.  In other cases it is safe to throttle.
+ */
+static int current_may_throttle(void)
+{
+       return !(current->flags & PF_LESS_THROTTLE) ||
+               current->backing_dev_info == NULL ||
+               bdi_write_congested(current->backing_dev_info);
+}
+
 /*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
@@ -1566,7 +1579,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                 * implies that pages are cycling through the LRU faster than
                 * they are written so also forcibly stall.
                 */
-               if (nr_unqueued_dirty == nr_taken || nr_immediate)
+               if ((nr_unqueued_dirty == nr_taken || nr_immediate) &&
+                   current_may_throttle())
                        congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
 
@@ -1575,7 +1589,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         * is congested. Allow kswapd to continue until it starts encountering
         * unqueued dirty pages or cycling through the LRU too quickly.
         */
-       if (!sc->hibernation_mode && !current_is_kswapd())
+       if (!sc->hibernation_mode && !current_is_kswapd() &&
+           current_may_throttle())
                wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
 
        trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
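
The helper added above keys off PF_LESS_THROTTLE and the task's backing_dev_info: an ordinary task is always eligible for throttling, while a PF_LESS_THROTTLE task (such as nfsd writing to a loop-back mount) is throttled only when the device it is writing to is congested or unknown. As a rough standalone illustration of that decision only, the sketch below uses simplified stand-ins for struct task_struct, the PF_LESS_THROTTLE value and bdi_write_congested(); none of these definitions are the real kernel ones.

#include <stdbool.h>
#include <stdio.h>

#define PF_LESS_THROTTLE 0x00100000	/* stand-in for the kernel flag */

struct backing_dev_info { bool write_congested; };

struct task {
	unsigned int flags;
	struct backing_dev_info *backing_dev_info;
};

/* Stand-in for the kernel's bdi_write_congested(). */
static bool bdi_write_congested(const struct backing_dev_info *bdi)
{
	return bdi->write_congested;
}

/*
 * Same shape as current_may_throttle(): ordinary tasks always throttle;
 * a PF_LESS_THROTTLE task throttles only if the backing device it is
 * writing to is congested or unknown.
 */
static bool may_throttle(const struct task *tsk)
{
	return !(tsk->flags & PF_LESS_THROTTLE) ||
	       tsk->backing_dev_info == NULL ||
	       bdi_write_congested(tsk->backing_dev_info);
}

int main(void)
{
	struct backing_dev_info bdi = { .write_congested = false };
	struct task normal    = { 0, NULL };
	struct task nfsd_like = { PF_LESS_THROTTLE, &bdi };

	printf("normal task:              %d\n", may_throttle(&normal));    /* 1 */
	printf("nfsd-like, bdi idle:      %d\n", may_throttle(&nfsd_like)); /* 0 */
	bdi.write_congested = true;
	printf("nfsd-like, bdi congested: %d\n", may_throttle(&nfsd_like)); /* 1 */
	return 0;
}
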
@@ -1866,6 +1881,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
        bool force_scan = false;
        unsigned long ap, fp;
        enum lru_list lru;
+       bool some_scanned;
+       int pass;
 
        /*
         * If the zone or memcg is small, nr[l] can be 0.  This
@@ -1989,39 +2006,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
        fraction[1] = fp;
        denominator = ap + fp + 1;
 out:
-       for_each_evictable_lru(lru) {
-               int file = is_file_lru(lru);
-               unsigned long size;
-               unsigned long scan;
+       some_scanned = false;
+       /* Only use force_scan on second pass. */
+       for (pass = 0; !some_scanned && pass < 2; pass++) {
+               for_each_evictable_lru(lru) {
+                       int file = is_file_lru(lru);
+                       unsigned long size;
+                       unsigned long scan;
 
-               size = get_lru_size(lruvec, lru);
-               scan = size >> sc->priority;
+                       size = get_lru_size(lruvec, lru);
+                       scan = size >> sc->priority;
 
-               if (!scan && force_scan)
-                       scan = min(size, SWAP_CLUSTER_MAX);
+                       if (!scan && pass && force_scan)
+                               scan = min(size, SWAP_CLUSTER_MAX);
 
-               switch (scan_balance) {
-               case SCAN_EQUAL:
-                       /* Scan lists relative to size */
-                       break;
-               case SCAN_FRACT:
+                       switch (scan_balance) {
+                       case SCAN_EQUAL:
+                               /* Scan lists relative to size */
+                               break;
+                       case SCAN_FRACT:
+                               /*
+                                * Scan types proportional to swappiness and
+                                * their relative recent reclaim efficiency.
+                                */
+                               scan = div64_u64(scan * fraction[file],
+                                                       denominator);
+                               break;
+                       case SCAN_FILE:
+                       case SCAN_ANON:
+                               /* Scan one type exclusively */
+                               if ((scan_balance == SCAN_FILE) != file)
+                                       scan = 0;
+                               break;
+                       default:
+                               /* Look ma, no brain */
+                               BUG();
+                       }
+                       nr[lru] = scan;
                        /*
-                        * Scan types proportional to swappiness and
-                        * their relative recent reclaim efficiency.
+                        * Skip the second pass and don't force_scan,
+                        * if we found something to scan.
                         */
-                       scan = div64_u64(scan * fraction[file], denominator);
-                       break;
-               case SCAN_FILE:
-               case SCAN_ANON:
-                       /* Scan one type exclusively */
-                       if ((scan_balance == SCAN_FILE) != file)
-                               scan = 0;
-                       break;
-               default:
-                       /* Look ma, no brain */
-                       BUG();
+                       some_scanned |= !!scan;
                }
-               nr[lru] = scan;
        }
 }
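
The reshuffled loop above is a two-pass calculation: the first pass computes size >> priority for each evictable list, and only if every target came out zero does a second pass apply the force_scan minimum. Leaving the SCAN_FRACT/SCAN_FILE/SCAN_ANON adjustments aside, the control flow can be sketched in isolation as below; the list count, the sizes and SWAP_CLUSTER_MAX here are made-up stand-ins rather than the kernel definitions.

#include <stdbool.h>
#include <stdio.h>

#define NR_EVICTABLE_LRUS 4		/* stand-in for the evictable LRU lists */
#define SWAP_CLUSTER_MAX  32UL		/* stand-in for the kernel constant */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/*
 * Two-pass target calculation: pass 0 never forces a minimum scan
 * count, pass 1 does.  The outer loop stops as soon as any list got
 * a non-zero target, so force_scan only kicks in when the plain
 * size >> priority calculation produced nothing at all.
 */
static void scan_targets(const unsigned long size[NR_EVICTABLE_LRUS],
			 int priority, bool force_scan,
			 unsigned long nr[NR_EVICTABLE_LRUS])
{
	bool some_scanned = false;
	int pass, lru;

	for (pass = 0; !some_scanned && pass < 2; pass++) {
		for (lru = 0; lru < NR_EVICTABLE_LRUS; lru++) {
			unsigned long scan = size[lru] >> priority;

			if (!scan && pass && force_scan)
				scan = min_ul(size[lru], SWAP_CLUSTER_MAX);

			nr[lru] = scan;
			some_scanned |= !!scan;
		}
	}
}

int main(void)
{
	/* Tiny lists and a high priority: size >> priority is 0 everywhere,
	 * so only the second (forced) pass produces any work. */
	unsigned long size[NR_EVICTABLE_LRUS] = { 10, 0, 25, 5 };
	unsigned long nr[NR_EVICTABLE_LRUS];
	int lru;

	scan_targets(size, 12, true, nr);
	for (lru = 0; lru < NR_EVICTABLE_LRUS; lru++)
		printf("nr[%d] = %lu\n", lru, nr[lru]);
	return 0;
}
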
 
@@ -2525,10 +2552,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 
        for (i = 0; i <= ZONE_NORMAL; i++) {
                zone = &pgdat->node_zones[i];
+               if (!populated_zone(zone))
+                       continue;
+
                pfmemalloc_reserve += min_wmark_pages(zone);
                free_pages += zone_page_state(zone, NR_FREE_PAGES);
        }
 
+       /* If there are no reserves (unexpected config) then do not throttle */
+       if (!pfmemalloc_reserve)
+               return true;
+
        wmark_ok = free_pages > pfmemalloc_reserve / 2;
 
        /* kswapd must be awake if processes are being throttled */
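
To make the check concrete: if the populated zones up to ZONE_NORMAL contribute, say, 8192 pages of min watermark and 5000 pages are free, wmark_ok is true (5000 > 8192 / 2) and direct reclaimers are not throttled; the early return above likewise skips throttling entirely on a node whose low zones contribute no reserve at all.
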
@@ -2553,9 +2587,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
                                        nodemask_t *nodemask)
 {
+       struct zoneref *z;
        struct zone *zone;
-       int high_zoneidx = gfp_zone(gfp_mask);
-       pg_data_t *pgdat;
+       pg_data_t *pgdat = NULL;
 
        /*
         * Kernel threads should not be throttled as they may be indirectly
@@ -2574,10 +2608,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
        if (fatal_signal_pending(current))
                goto out;
 
-       /* Check if the pfmemalloc reserves are ok */
-       first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
-       pgdat = zone->zone_pgdat;
-       if (pfmemalloc_watermark_ok(pgdat))
+       /*
+        * Check if the pfmemalloc reserves are ok by finding the first node
+        * with a usable ZONE_NORMAL or lower zone. The expectation is that
+        * GFP_KERNEL will be required for allocating network buffers when
+        * swapping over the network so ZONE_HIGHMEM is unusable.
+        *
+        * Throttling is based on the first usable node and throttled processes
+        * wait on a queue until kswapd makes progress and wakes them. There
+        * is an affinity then between processes waking up and where reclaim
+        * progress has been made assuming the process wakes on the same node.
+        * More importantly, processes running on remote nodes will not compete
+        * for remote pfmemalloc reserves and processes on different nodes
+        * should make reasonable progress.
+        */
+       for_each_zone_zonelist_nodemask(zone, z, zonelist,
+                                       gfp_mask, nodemask) {
+               if (zone_idx(zone) > ZONE_NORMAL)
+                       continue;
+
+               /* Throttle based on the first usable node */
+               pgdat = zone->zone_pgdat;
+               if (pfmemalloc_watermark_ok(pgdat))
+                       goto out;
+               break;
+       }
+
+       /* If no zone was usable by the allocation flags then do not throttle */
+       if (!pgdat)
                goto out;
 
        /* Account for the throttling */
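
The loop above picks the throttle node by walking the allocation's candidate zones in order and ignoring anything above ZONE_NORMAL, so the first usable node decides whether to throttle at all. A simplified standalone sketch of that selection follows; struct zone, struct pgdat, the zone index values and pfmemalloc_watermark_ok() are minimal stand-ins for the kernel structures, not the real definitions.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's zone indices and structures. */
enum zone_idx { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

struct pgdat { int node_id; bool pfmemalloc_ok; };
struct zone  { enum zone_idx idx; struct pgdat *pgdat; };

/* Stand-in for pfmemalloc_watermark_ok(): true means reserves look healthy. */
static bool pfmemalloc_watermark_ok(const struct pgdat *pgdat)
{
	return pgdat->pfmemalloc_ok;
}

/*
 * Walk the allocation's candidate zones in order, skip anything above
 * ZONE_NORMAL, and decide based on the first node with a usable lower
 * zone.  If no such zone exists, or its reserves are fine, do not
 * throttle.
 */
static bool should_throttle(struct zone *zonelist[], size_t n)
{
	struct pgdat *pgdat = NULL;
	size_t i;

	for (i = 0; i < n; i++) {
		if (zonelist[i]->idx > ZONE_NORMAL)
			continue;
		pgdat = zonelist[i]->pgdat;
		if (pfmemalloc_watermark_ok(pgdat))
			return false;	/* reserves are fine, no throttling */
		break;			/* the first usable node decides */
	}
	return pgdat != NULL;		/* nothing usable: do not throttle */
}

int main(void)
{
	struct pgdat node0 = { 0, false };	/* reserves depleted */
	struct zone highmem = { ZONE_HIGHMEM, &node0 };
	struct zone normal  = { ZONE_NORMAL,  &node0 };
	struct zone *zl[] = { &highmem, &normal };

	printf("throttle: %d\n", should_throttle(zl, 2));	/* 1 */
	return 0;
}
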
@@ -3422,7 +3480,7 @@ int kswapd_run(int nid)
 
 /*
  * Called by memory hotplug when all memory in a node is offlined.  Caller must
- * hold lock_memory_hotplug().
+ * hold mem_hotplug_begin/end().
  */
 void kswapd_stop(int nid)
 {