UBUNTU: Ubuntu-4.15.0-96.97

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ac2ffd5e02b914fb9564649c9475babc51119de6..1a34cd1d17a3c65edd42f35b3f9de0b240f64873 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -233,6 +233,12 @@ enum res_type {
 /* Used for OOM notifier */
 #define OOM_CONTROL            (0)
 
+static inline bool should_force_charge(void)
+{
+       return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
+               (current->flags & PF_EXITING);
+}
+
 /* Some nice accessors for the vmpressure. */
 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 {
@@ -725,7 +731,7 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
                        if (unlikely(!memcg))
                                memcg = root_mem_cgroup;
                }
-       } while (!css_tryget_online(&memcg->css));
+       } while (!css_tryget(&memcg->css));
        rcu_read_unlock();
        return memcg;
 }
@@ -871,26 +877,45 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
                css_put(&prev->css);
 }
 
-static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
+                                       struct mem_cgroup *dead_memcg)
 {
-       struct mem_cgroup *memcg = dead_memcg;
        struct mem_cgroup_reclaim_iter *iter;
        struct mem_cgroup_per_node *mz;
        int nid;
        int i;
 
-       while ((memcg = parent_mem_cgroup(memcg))) {
-               for_each_node(nid) {
-                       mz = mem_cgroup_nodeinfo(memcg, nid);
-                       for (i = 0; i <= DEF_PRIORITY; i++) {
-                               iter = &mz->iter[i];
-                               cmpxchg(&iter->position,
-                                       dead_memcg, NULL);
-                       }
+       for_each_node(nid) {
+               mz = mem_cgroup_nodeinfo(from, nid);
+               for (i = 0; i <= DEF_PRIORITY; i++) {
+                       iter = &mz->iter[i];
+                       cmpxchg(&iter->position,
+                               dead_memcg, NULL);
                }
        }
 }
 
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+       struct mem_cgroup *memcg = dead_memcg;
+       struct mem_cgroup *last;
+
+       do {
+               __invalidate_reclaim_iterators(memcg, dead_memcg);
+               last = memcg;
+       } while ((memcg = parent_mem_cgroup(memcg)));
+
+       /*
+        * When cgroup1 non-hierarchical mode is used,
+        * parent_mem_cgroup() does not walk all the way up to the
+        * cgroup root (root_mem_cgroup). So we have to handle
+        * dead_memcg from cgroup root separately.
+        */
+       if (last != root_mem_cgroup)
+               __invalidate_reclaim_iterators(root_mem_cgroup,
+                                               dead_memcg);
+}
+
 /*
  * Iteration constructs for visiting all cgroups (under a tree).  If
  * loops are exited prematurely (break), mem_cgroup_iter_break() must
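
Taken together, the rewritten walk starts at dead_memcg itself and only then climbs through the parents, with the root handled as an explicit extra step. A minimal userspace sketch of that shape; the toy_cgroup type and the invalidate_one() helper are illustrative only, not kernel API:

    #include <stdio.h>

    /* Toy model: a group with a parent pointer; the root has parent == NULL. */
    struct toy_cgroup {
        const char *name;
        struct toy_cgroup *parent;
    };

    static struct toy_cgroup toy_root = { "root", NULL };

    /* Stand-in for __invalidate_reclaim_iterators(): per-group cleanup. */
    static void invalidate_one(struct toy_cgroup *cg, struct toy_cgroup *dead)
    {
        printf("clear iterators in %s pointing at %s\n", cg->name, dead->name);
    }

    static void invalidate_all(struct toy_cgroup *dead)
    {
        struct toy_cgroup *cg = dead;
        struct toy_cgroup *last;

        /* Start at the dead group itself, then walk up through the parents. */
        do {
            invalidate_one(cg, dead);
            last = cg;
        } while ((cg = cg->parent));

        /*
         * In cgroup1 non-hierarchical mode the walk can stop before the
         * root, so the root is handled as an explicit extra pass.
         */
        if (last != &toy_root)
            invalidate_one(&toy_root, dead);
    }

    int main(void)
    {
        struct toy_cgroup orphan = { "A", NULL }; /* not wired to toy_root */
        invalidate_all(&orphan);
        return 0;
    }
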
@@ -1251,8 +1276,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
        };
        bool ret;
 
-       mutex_lock(&oom_lock);
-       ret = out_of_memory(&oc);
+       if (mutex_lock_killable(&oom_lock))
+               return true;
+       /*
+        * A few threads that were not waiting at mutex_lock_killable() can
+        * fail to bail out there. Therefore, re-check the bail-out condition
+        * after acquiring oom_lock.
+        */
+       ret = should_force_charge() || out_of_memory(&oc);
        mutex_unlock(&oom_lock);
        return ret;
 }
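
The comment above captures a general locking pattern: a bail-out condition checked before sleeping on a lock can become true while the thread is blocked, so it must be re-checked once the lock is held. A small pthread sketch of the same pattern, assuming illustrative names (stop_requested, do_expensive_work):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    static pthread_mutex_t work_lock = PTHREAD_MUTEX_INITIALIZER;
    static atomic_bool stop_requested;

    /* Stand-in for out_of_memory(): the expensive work the lock serializes. */
    static bool do_expensive_work(void) { return true; }

    static bool try_work(void)
    {
        if (pthread_mutex_lock(&work_lock) != 0)
            return true;    /* treat lock failure like a bail-out */

        /*
         * Another thread may have set stop_requested while we slept
         * waiting for work_lock, so the flag must be tested again under
         * the lock, not only before taking it.
         */
        bool ret = atomic_load(&stop_requested) || do_expensive_work();
        pthread_mutex_unlock(&work_lock);
        return ret;
    }
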
@@ -1936,15 +1966,22 @@ retry:
                goto retry;
        }
 
+       /*
+        * Memcg doesn't have a dedicated reserve for atomic
+        * allocations. But like the global atomic pool, we need to
+        * put the burden of reclaim on regular allocation requests
+        * and let these go through as privileged allocations.
+        */
+       if (gfp_mask & __GFP_ATOMIC)
+               goto force;
+
        /*
         * Unlike in global OOM situations, memcg is not in a physical
         * memory shortage.  Allow dying and OOM-killed tasks to
         * bypass the last charges so that they can exit quickly and
         * free their memory.
         */
-       if (unlikely(tsk_is_oom_victim(current) ||
-                    fatal_signal_pending(current) ||
-                    current->flags & PF_EXITING))
+       if (unlikely(should_force_charge()))
                goto force;
 
        /*
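
A compressed model of the resulting charge policy: regular requests pay for reclaim, while atomic requests and dying tasks may overrun the limit. The sketch below is a userspace approximation; FLAG_ATOMIC and caller_is_dying() are stand-ins for __GFP_ATOMIC and should_force_charge(), and struct counter is a toy, not page_counter:

    #include <stdatomic.h>
    #include <stdbool.h>

    #define FLAG_ATOMIC (1u << 0)        /* stand-in for __GFP_ATOMIC */

    struct counter {
        _Atomic long usage;
        long limit;
    };

    /* Stand-in for should_force_charge(): is the caller dying/exiting? */
    static bool caller_is_dying(void) { return false; }

    static bool try_charge(struct counter *c, unsigned int flags, long nr)
    {
        long usage = atomic_fetch_add(&c->usage, nr) + nr;

        if (usage <= c->limit)
            return true;                 /* within limit: charge succeeds */

        /*
         * Over limit.  Atomic requests cannot reclaim, and dying tasks
         * should exit quickly, so both are allowed through; the reclaim
         * burden stays on regular requests.
         */
        if ((flags & FLAG_ATOMIC) || caller_is_dying())
            return true;                 /* forced charge, above the limit */

        atomic_fetch_sub(&c->usage, nr); /* undo; caller reclaims and retries */
        return false;
    }
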
@@ -2205,7 +2242,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 {
        struct memcg_kmem_cache_create_work *cw;
 
-       cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
+       cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
        if (!cw)
                return;
 
@@ -2333,6 +2370,16 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
            !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+
+               /*
+                * Enforce __GFP_NOFAIL allocation because callers are not
+                * prepared to see failures and likely do not have any failure
+                * handling code.
+                */
+               if (gfp & __GFP_NOFAIL) {
+                       page_counter_charge(&memcg->kmem, nr_pages);
+                       return 0;
+               }
                cancel_charge(memcg, nr_pages);
                return -ENOMEM;
        }
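
The same idea in isolation: when a caller has declared it cannot handle failure, the counter is charged past its limit instead of returning an error. A minimal single-threaded sketch with a toy limit_counter type; nofail plays the role of __GFP_NOFAIL:

    #include <stdbool.h>

    struct limit_counter {
        long usage;
        long limit;
    };

    static bool counter_try_charge(struct limit_counter *c, long nr)
    {
        if (c->usage + nr > c->limit)
            return false;
        c->usage += nr;
        return true;
    }

    /* Unconditional variant, mirroring page_counter_charge(). */
    static void counter_force_charge(struct limit_counter *c, long nr)
    {
        c->usage += nr;                  /* may push usage above the limit */
    }

    static int charge(struct limit_counter *c, long nr, bool nofail)
    {
        if (counter_try_charge(c, nr))
            return 0;
        if (nofail) {
            /* Caller cannot handle failure: overrun the limit instead. */
            counter_force_charge(c, nr);
            return 0;
        }
        return -1;                       /* -ENOMEM analogue */
    }
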
@@ -2355,7 +2402,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
        struct mem_cgroup *memcg;
        int ret = 0;
 
-       if (memcg_kmem_bypass())
+       if (mem_cgroup_disabled() || memcg_kmem_bypass())
                return 0;
 
        memcg = get_mem_cgroup_from_mm(current->mm);
@@ -4110,6 +4157,14 @@ static struct cftype mem_cgroup_legacy_files[] = {
 
 static DEFINE_IDR(mem_cgroup_idr);
 
+static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
+{
+       if (memcg->id.id > 0) {
+               idr_remove(&mem_cgroup_idr, memcg->id.id);
+               memcg->id.id = 0;
+       }
+}
+
 static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
 {
        VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
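
The guard on id.id > 0 together with zeroing the field makes mem_cgroup_id_remove() idempotent, which is what lets the later hunks call it from several overlapping error paths. A tiny sketch of that release shape, with a hypothetical array registry standing in for mem_cgroup_idr:

    #include <stddef.h>

    #define MAX_IDS 64
    static void *registry[MAX_IDS];      /* hypothetical stand-in for an IDR */

    struct object {
        int id;                          /* 0 means "not registered" */
    };

    static void object_id_remove(struct object *obj)
    {
        /*
         * Guard + reset make this safe to call twice, e.g. from an
         * allocation error path and again from the final teardown: the
         * second call sees id == 0 and does nothing.
         */
        if (obj->id > 0) {
            registry[obj->id] = NULL;
            obj->id = 0;
        }
    }
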
@@ -4120,8 +4175,7 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
 {
        VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
        if (atomic_sub_and_test(n, &memcg->id.ref)) {
-               idr_remove(&mem_cgroup_idr, memcg->id.id);
-               memcg->id.id = 0;
+               mem_cgroup_id_remove(memcg);
 
                /* Memcg ID pins CSS */
                css_put(&memcg->css);
@@ -4187,6 +4241,9 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
        struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
 
+       if (!pn)
+               return;
+
        free_percpu(pn->lruvec_stat);
        kfree(pn);
 }
@@ -4255,8 +4312,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
        idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
        return memcg;
 fail:
-       if (memcg->id.id > 0)
-               idr_remove(&mem_cgroup_idr, memcg->id.id);
+       mem_cgroup_id_remove(memcg);
        __mem_cgroup_free(memcg);
        return NULL;
 }
@@ -4315,6 +4371,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 
        return &memcg->css;
 fail:
+       mem_cgroup_id_remove(memcg);
        mem_cgroup_free(memcg);
        return ERR_PTR(-ENOMEM);
 }
@@ -5828,6 +5885,20 @@ void mem_cgroup_sk_alloc(struct sock *sk)
        if (!mem_cgroup_sockets_enabled)
                return;
 
+       /*
+        * Socket cloning can throw us here with sk_memcg already
+        * filled. It won't, however, necessarily happen from
+        * process context. So the test for root memcg given
+        * the current task's memcg won't help us in this case.
+        *
+        * Respecting the original socket's memcg is a better
+        * decision in this case.
+        */
+       if (sk->sk_memcg) {
+               css_get(&sk->sk_memcg->css);
+               return;
+       }
+
        rcu_read_lock();
        memcg = mem_cgroup_from_task(current);
        if (memcg == root_mem_cgroup)
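
The early return added above illustrates a broader rule for cloned state: pin the reference the clone inherited rather than rederiving it from the current task, which may be unrelated to the socket. A toy sketch of that choice; struct toy_sock, struct group, and current_task_group() are illustrative stand-ins for struct sock, the memcg css, and mem_cgroup_from_task(current):

    #include <stdatomic.h>
    #include <stddef.h>

    struct group {
        _Atomic int refcnt;
    };

    struct toy_sock {
        struct group *memcg;             /* stand-in for sk->sk_memcg */
    };

    static void group_get(struct group *g) { atomic_fetch_add(&g->refcnt, 1); }

    /* Stand-in for mem_cgroup_from_task(current); trivially NULL here. */
    static struct group *current_task_group(void) { return NULL; }

    static void sock_group_alloc(struct toy_sock *sk)
    {
        /*
         * A cloned socket arrives here with memcg already copied from
         * its parent, possibly from a context unrelated to the current
         * task.  Pin the inherited group instead of consulting the task.
         */
        if (sk->memcg) {
            group_get(sk->memcg);
            return;
        }

        sk->memcg = current_task_group();
        if (sk->memcg)
            group_get(sk->memcg);
    }
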