writeback, memcg: Implement foreign dirty flushing

author Tejun Heo <tj@kernel.org>

Mon, 26 Aug 2019 16:06:56 +0000 (09:06 -0700)

committer Jens Axboe <axboe@kernel.dk>

Tue, 27 Aug 2019 15:22:38 +0000 (09:22 -0600)
author Tejun Heo <tj@kernel.org>
Mon, 26 Aug 2019 16:06:56 +0000 (09:06 -0700)
committer Jens Axboe <axboe@kernel.dk>
Tue, 27 Aug 2019 15:22:38 +0000 (09:22 -0600)
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h

index 1075f2552cfc081e9115b40e02cdae534d5a55a7..4fc87dee005ab13e8208c5d19c92700384edffbd 100644 (file)
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -63,6 +63,7 @@ enum wb_reason {
          * so it has a mismatch name.
          */
         WB_REASON_FORKER_THREAD,
+       WB_REASON_FOREIGN_FLUSH,
  
         WB_REASON_MAX,
  };
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index 44c41462be334dac8edbc365c293d7014bed29e9..bc69d5725760c4754ee27e30d62728976b52b17b 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -183,6 +183,23 @@ struct memcg_padding {
  #define MEMCG_PADDING(name)
  #endif
  
+/*
+ * Remember four most recent foreign writebacks with dirty pages in this
+ * cgroup.  Inode sharing is expected to be uncommon and, even if we miss
+ * one in a given round, we're likely to catch it later if it keeps
+ * foreign-dirtying, so a fairly low count should be enough.
+ *
+ * See mem_cgroup_track_foreign_dirty_slowpath() for details.
+ */
+#define MEMCG_CGWB_FRN_CNT     4
+
+struct memcg_cgwb_frn {
+       u64 bdi_id;                     /* bdi->id of the foreign inode */
+       int memcg_id;                   /* memcg->css.id of foreign inode */
+       u64 at;                         /* jiffies_64 at the time of dirtying;
+                                        * refreshed lazily on re-dirtying and
+                                        * zeroed when a flush is issued */
+       struct wb_completion done;      /* tracks in-flight foreign writebacks;
+                                        * done.cnt == 1 is treated as idle */
+};
+
  /*
   * The memory controller data structure. The memory controller controls both
   * page cache and RSS per cgroup. We would eventually like to provide
@@ -307,6 +324,7 @@ struct mem_cgroup {
  #ifdef CONFIG_CGROUP_WRITEBACK
         struct list_head cgwb_list;
         struct wb_domain cgwb_domain;
+       struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
  #endif
  
         /* List of events which userspace want to receive */
@@ -1218,6 +1236,18 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
                          unsigned long *pheadroom, unsigned long *pdirty,
                          unsigned long *pwriteback);
  
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+                                            struct bdi_writeback *wb);
+
+/*
+ * Fast-path check run when @page is dirtied against @wb.  Hands off to
+ * mem_cgroup_track_foreign_dirty_slowpath() only when the memcg the page
+ * belongs to differs from the memcg that owns @wb, i.e. the dirtying is
+ * foreign.
+ *
+ * page->mem_cgroup can be NULL (e.g. a page which isn't charged to any
+ * memcg).  Such a page has no memcg to record the event on and the
+ * slowpath dereferences the memcg unconditionally, so it must be filtered
+ * out here.
+ */
+static inline void mem_cgroup_track_foreign_dirty(struct page *page,
+                                                 struct bdi_writeback *wb)
+{
+       struct mem_cgroup *memcg = page->mem_cgroup;
+
+       if (unlikely(memcg && &memcg->css != wb->memcg_css))
+               mem_cgroup_track_foreign_dirty_slowpath(page, wb);
+}
+
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
+
  #else  /* CONFIG_CGROUP_WRITEBACK */
  
  static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
@@ -1233,6 +1263,15 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
  {
  }
  
+static inline void mem_cgroup_track_foreign_dirty(struct page *page,
+                                                 struct bdi_writeback *wb)
+{      /* no-op stub: built without CONFIG_CGROUP_WRITEBACK */
+}
+
+static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{      /* no-op stub: built without CONFIG_CGROUP_WRITEBACK */
+}
+
  #endif /* CONFIG_CGROUP_WRITEBACK */
  
  struct sock;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index cdbb7a84cb6e186242afeebb32f4c5c7fb2d0639..89b65f5ca634b80d10ca6b95297eb5764c10998a 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -87,6 +87,10 @@ int do_swap_account __read_mostly;
  #define do_swap_account                0
  #endif
  
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
+
  /* Whether legacy memory+swap accounting is active */
  static bool do_memsw_account(void)
  {
@@ -4145,6 +4149,127 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
         }
  }
  
+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback.  The former
+ * tracks ownership per-page while the latter per-inode.  This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases.  For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode.  B owns the inode and
+ * A's memory is limited far below B's.  A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback.  A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repeatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underlying IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction.  However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such mechanism.  When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired.  Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+                                            struct bdi_writeback *wb)
+{
+       struct mem_cgroup *memcg = page->mem_cgroup;
+       struct memcg_cgwb_frn *frn;
+       u64 now = get_jiffies_64();
+       u64 oldest_at = now;    /* only slots stamped before now can be evicted */
+       int oldest = -1;        /* -1: no evictable slot found */
+       int i;
+
+       /*
+        * Pick the slot to use.  If there is already a slot for @wb, keep
+        * using it.  If not replace the oldest one which isn't being
+        * written out.
+        *
+        * A slot matches when both its bdi_id and memcg_id equal those of
+        * @wb; the loop stops there with @frn pointing at it.  A slot
+        * counts as not being written out when its done.cnt reads 1.
+        */
+       for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+               frn = &memcg->cgwb_frn[i];
+               if (frn->bdi_id == wb->bdi->id &&
+                   frn->memcg_id == wb->memcg_css->id)
+                       break;
+               if (time_before64(frn->at, oldest_at) &&
+                   atomic_read(&frn->done.cnt) == 1) {
+                       oldest = i;
+                       oldest_at = frn->at;
+               }
+       }
+
+       if (i < MEMCG_CGWB_FRN_CNT) {
+               /*
+                * Re-using an existing one.  Update timestamp lazily to
+                * avoid making the cacheline hot.  We want them to be
+                * reasonably up-to-date and significantly shorter than
+                * dirty_expire_interval as that's what expires the record.
+                * Use the shorter of 1s and dirty_expire_interval / 8.
+                *
+                * dirty_expire_interval is multiplied by 10 before being
+                * passed to msecs_to_jiffies(), i.e. it is handled as a
+                * value in centiseconds here.
+                */
+               unsigned long update_intv =
+                       min_t(unsigned long, HZ,
+                             msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+               if (time_before64(frn->at, now - update_intv))
+                       frn->at = now;
+       } else if (oldest >= 0) {
+               /*
+                * Replace the oldest free one.  If no slot matched and none
+                * was evictable, the event is simply not recorded.
+                */
+               frn = &memcg->cgwb_frn[oldest];
+               frn->bdi_id = wb->bdi->id;
+               frn->memcg_id = wb->memcg_css->id;
+               frn->at = now;
+       }
+}
+
+/*
+ * mem_cgroup_flush_foreign - issue foreign writeback flushes for recorded
+ * foreign dirtying events
+ * @wb: bdi_writeback whose owning memcg's records are examined
+ *
+ * Walks the cgwb_frn[] records of the memcg owning @wb and, for each record
+ * which is recent enough and has no flush in flight, starts writeback on
+ * the recorded (bdi_id, memcg_id) pair with WB_REASON_FOREIGN_FLUSH.  Like
+ * the recording side, this is lockless and best-effort.
+ */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+       unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+       /*
+        * Read the clock with get_jiffies_64(), the same way
+        * mem_cgroup_track_foreign_dirty_slowpath() stamps the records
+        * compared below.  A plain load of jiffies_64 isn't guaranteed to
+        * be atomic on 32-bit architectures.
+        */
+       u64 now = get_jiffies_64();
+       int i;
+
+       for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+               struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+               /*
+                * If the record is older than dirty_expire_interval,
+                * writeback on it has already started.  No need to kick it
+                * off again.  Also, don't start a new one if there's
+                * already one in flight.
+                */
+               if (time_after64(frn->at, now - intv) &&
+                   atomic_read(&frn->done.cnt) == 1) {
+                       /*
+                        * Zero the timestamp so that the record fails the
+                        * recency test above until it gets re-stamped by
+                        * another foreign dirtying event.
+                        */
+                       frn->at = 0;
+                       cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+                                              WB_REASON_FOREIGN_FLUSH,
+                                              &frn->done);
+               }
+       }
+}
+
  #else  /* CONFIG_CGROUP_WRITEBACK */
  
  static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@ -4661,6 +4786,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
         struct mem_cgroup *memcg;
         unsigned int size;
         int node;
+       int __maybe_unused i;
  
         size = sizeof(struct mem_cgroup);
         size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -4704,6 +4830,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
  #endif
  #ifdef CONFIG_CGROUP_WRITEBACK
         INIT_LIST_HEAD(&memcg->cgwb_list);
+       for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+               memcg->cgwb_frn[i].done =
+                       __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
  #endif
         idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
         return memcg;
@@ -4833,7 +4962,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
  static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       int __maybe_unused i;
  
+#ifdef CONFIG_CGROUP_WRITEBACK
+       for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+               wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
                 static_branch_dec(&memcg_sockets_enabled_key);
  
diff --git a/mm/page-writeback.c b/mm/page-writeback.c

index 1804f64ff43c1c3cec898488c456e2cd52beb041..50055d2e4ea85cf305e209f8a7dd3c84c8251721 100644 (file)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
                 if (unlikely(!writeback_in_progress(wb)))
                         wb_start_background_writeback(wb);
  
+               mem_cgroup_flush_foreign(wb);
+
                 /*
                  * Calculate global domain's pos_ratio and select the
                  * global dtc by default.
@@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
                 task_io_account_write(PAGE_SIZE);
                 current->nr_dirtied++;
                 this_cpu_inc(bdp_ratelimits);
+
+               mem_cgroup_track_foreign_dirty(page, wb);
         }
  }
author	Tejun Heo <tj@kernel.org>
	Mon, 26 Aug 2019 16:06:56 +0000 (09:06 -0700)
committer	Jens Axboe <axboe@kernel.dk>
	Tue, 27 Aug 2019 15:22:38 +0000 (09:22 -0600)
include/linux/backing-dev-defs.h		patch \| blob \| blame \| history
include/linux/memcontrol.h		patch \| blob \| blame \| history
mm/memcontrol.c		patch \| blob \| blame \| history
mm/page-writeback.c		patch \| blob \| blame \| history