mm: fix negative nr_isolated counts
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2f6893c2f01b6c9649992910d902b94c49ae8ece..d18d3a6e7337d944a36e1375b1f50fdce483961f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 #define MEM_CGROUP_RECLAIM_RETRIES     5
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
 
+/* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
-/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
 int do_swap_account __read_mostly;
-
-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
 #else
 #define do_swap_account                0
 #endif
 
-
 static const char * const mem_cgroup_stat_names[] = {
        "cache",
        "rss",
@@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = {
        "swap",
 };
 
-enum mem_cgroup_events_index {
-       MEM_CGROUP_EVENTS_PGPGIN,       /* # of pages paged in */
-       MEM_CGROUP_EVENTS_PGPGOUT,      /* # of pages paged out */
-       MEM_CGROUP_EVENTS_PGFAULT,      /* # of page-faults */
-       MEM_CGROUP_EVENTS_PGMAJFAULT,   /* # of major page-faults */
-       MEM_CGROUP_EVENTS_NSTATS,
-};
-
 static const char * const mem_cgroup_events_names[] = {
        "pgpgin",
        "pgpgout",
@@ -138,7 +121,7 @@ enum mem_cgroup_events_target {
 
 struct mem_cgroup_stat_cpu {
        long count[MEM_CGROUP_STAT_NSTATS];
-       unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
+       unsigned long events[MEMCG_NR_EVENTS];
        unsigned long nr_page_events;
        unsigned long targets[MEM_CGROUP_NTARGETS];
 };
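
The mem_cgroup_events_index enum removed above is evidently not dropped but relocated: the MEMCG_LOW/MEMCG_HIGH/MEMCG_MAX/MEMCG_OOM and MEMCG_NR_EVENTS names used later in this diff suggest it now lives in include/linux/memcontrol.h, extended with the new default-hierarchy events. A sketch of what that header-side definition would look like (not part of this blobdiff):

enum mem_cgroup_events_index {
        MEM_CGROUP_EVENTS_PGPGIN,       /* # of pages paged in */
        MEM_CGROUP_EVENTS_PGPGOUT,      /* # of pages paged out */
        MEM_CGROUP_EVENTS_PGFAULT,      /* # of page-faults */
        MEM_CGROUP_EVENTS_PGMAJFAULT,   /* # of major page-faults */
        MEM_CGROUP_EVENTS_NSTATS,
        /* default hierarchy events, surfaced via memory.events */
        MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS,
        MEMCG_HIGH,
        MEMCG_MAX,
        MEMCG_OOM,
        MEMCG_NR_EVENTS,
};
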
@@ -284,6 +267,10 @@ struct mem_cgroup {
        struct page_counter memsw;
        struct page_counter kmem;
 
+       /* Normal memory consumption range */
+       unsigned long low;
+       unsigned long high;
+
        unsigned long soft_limit;
 
        /* vmpressure notifications */
@@ -325,9 +312,11 @@ struct mem_cgroup {
        /*
         * set > 0 if pages under this cgroup are moving to other cgroup.
         */
-       atomic_t        moving_account;
+       atomic_t                moving_account;
        /* taken only while moving_account > 0 */
-       spinlock_t      move_lock;
+       spinlock_t              move_lock;
+       struct task_struct      *move_lock_task;
+       unsigned long           move_lock_flags;
        /*
         * percpu counter.
         */
@@ -343,11 +332,10 @@ struct mem_cgroup {
        struct cg_proto tcp_mem;
 #endif
 #if defined(CONFIG_MEMCG_KMEM)
-       /* analogous to slab_common's slab_caches list, but per-memcg;
-        * protected by memcg_slab_mutex */
-       struct list_head memcg_slab_caches;
-        /* Index in the kmem_cache->memcg_params->memcg_caches array */
+        /* Index in the kmem_cache->memcg_params.memcg_caches array */
        int kmemcg_id;
+       bool kmem_acct_activated;
+       bool kmem_acct_active;
 #endif
 
        int last_scanned_node;
@@ -366,29 +354,26 @@ struct mem_cgroup {
 };
 
 #ifdef CONFIG_MEMCG_KMEM
-static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
-       return memcg->kmemcg_id >= 0;
+       return memcg->kmem_acct_active;
 }
 #endif
 
 /* Stuff for moving charges at task migration. */
 /*
- * Types of charges to be moved. "move_charge_at_immitgrate" and
- * "immigrate_flags" are treated as a left-shifted bitmap of these types.
+ * Types of charges to be moved.
  */
-enum move_type {
-       MOVE_CHARGE_TYPE_ANON,  /* private anonymous page and swap of it */
-       MOVE_CHARGE_TYPE_FILE,  /* file page(including tmpfs) and swap of it */
-       NR_MOVE_TYPE,
-};
+#define MOVE_ANON      0x1U
+#define MOVE_FILE      0x2U
+#define MOVE_MASK      (MOVE_ANON | MOVE_FILE)
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
        spinlock_t        lock; /* for from, to */
        struct mem_cgroup *from;
        struct mem_cgroup *to;
-       unsigned long immigrate_flags;
+       unsigned long flags;
        unsigned long precharge;
        unsigned long moved_charge;
        unsigned long moved_swap;
@@ -399,16 +384,6 @@ static struct move_charge_struct {
        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
-static bool move_anon(void)
-{
-       return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
-}
-
-static bool move_file(void)
-{
-       return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
-}
-
 /*
  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
  * limit reclaim to prevent infinite loops, if they ever occur.
@@ -544,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 
-static void disarm_sock_keys(struct mem_cgroup *memcg)
-{
-       if (!memcg_proto_activated(&memcg->tcp_mem))
-               return;
-       static_key_slow_dec(&memcg_socket_limit_enabled);
-}
-#else
-static void disarm_sock_keys(struct mem_cgroup *memcg)
-{
-}
 #endif
 
 #ifdef CONFIG_MEMCG_KMEM
 /*
- * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
  * The main reason for not using cgroup id for this:
  *  this works better in sparse environments, where we have a lot of memcgs,
  *  but only a few kmem-limited. Or also, if we have, for instance, 200
  *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
  *  200 entry array for that.
  *
- * The current size of the caches array is stored in
- * memcg_limited_groups_array_size.  It will double each time we have to
- * increase it.
+ * The current size of the caches array is stored in memcg_nr_cache_ids. It
+ * will double each time we have to increase it.
  */
-static DEFINE_IDA(kmem_limited_groups);
-int memcg_limited_groups_array_size;
+static DEFINE_IDA(memcg_cache_ida);
+int memcg_nr_cache_ids;
+
+/* Protects memcg_nr_cache_ids */
+static DECLARE_RWSEM(memcg_cache_ids_sem);
+
+void memcg_get_cache_ids(void)
+{
+       down_read(&memcg_cache_ids_sem);
+}
+
+void memcg_put_cache_ids(void)
+{
+       up_read(&memcg_cache_ids_sem);
+}
 
 /*
  * MIN_SIZE is different than 1, because we would like to avoid going through
@@ -596,32 +573,8 @@ int memcg_limited_groups_array_size;
 struct static_key memcg_kmem_enabled_key;
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
-static void memcg_free_cache_id(int id);
-
-static void disarm_kmem_keys(struct mem_cgroup *memcg)
-{
-       if (memcg_kmem_is_active(memcg)) {
-               static_key_slow_dec(&memcg_kmem_enabled_key);
-               memcg_free_cache_id(memcg->kmemcg_id);
-       }
-       /*
-        * This check can't live in kmem destruction function,
-        * since the charges will outlive the cgroup
-        */
-       WARN_ON(page_counter_read(&memcg->kmem));
-}
-#else
-static void disarm_kmem_keys(struct mem_cgroup *memcg)
-{
-}
 #endif /* CONFIG_MEMCG_KMEM */
 
-static void disarm_static_keys(struct mem_cgroup *memcg)
-{
-       disarm_sock_keys(memcg);
-       disarm_kmem_keys(memcg);
-}
-
 static struct mem_cgroup_per_zone *
 mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
 {
@@ -1368,6 +1321,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
        return inactive * inactive_ratio < active;
 }
 
+bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
+{
+       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup *memcg;
+
+       if (mem_cgroup_disabled())
+               return true;
+
+       mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+       memcg = mz->memcg;
+
+       return !!(memcg->css.flags & CSS_ONLINE);
+}
+
 #define mem_cgroup_from_counter(counter, member)       \
        container_of(counter, struct mem_cgroup, member)
 
@@ -1560,7 +1527,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
         * quickly exit and free its memory.
         */
        if (fatal_signal_pending(current) || task_will_free_mem(current)) {
-               set_thread_flag(TIF_MEMDIE);
+               mark_tsk_oom_victim(current);
                return;
        }
 
@@ -1934,7 +1901,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
        if (!memcg)
                return false;
 
-       if (!handle)
+       if (!handle || oom_killer_disabled)
                goto cleanup;
 
        owait.memcg = memcg;
@@ -1980,34 +1947,33 @@ cleanup:
 /**
  * mem_cgroup_begin_page_stat - begin a page state statistics transaction
  * @page: page that is going to change accounted state
- * @locked: &memcg->move_lock slowpath was taken
- * @flags: IRQ-state flags for &memcg->move_lock
  *
  * This function must mark the beginning of an accounted page state
  * change to prevent double accounting when the page is concurrently
  * being moved to another memcg:
  *
- *   memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ *   memcg = mem_cgroup_begin_page_stat(page);
  *   if (TestClearPageState(page))
  *     mem_cgroup_update_page_stat(memcg, state, -1);
- *   mem_cgroup_end_page_stat(memcg, locked, flags);
- *
- * The RCU lock is held throughout the transaction.  The fast path can
- * get away without acquiring the memcg->move_lock (@locked is false)
- * because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when the page
- * state that is going to change is the only thing preventing the page
- * from being uncharged.  E.g. end-writeback clearing PageWriteback(),
- * which allows migration to go ahead and uncharge the page before the
- * account transaction might be complete.
+ *   mem_cgroup_end_page_stat(memcg);
  */
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
-                                             bool *locked,
-                                             unsigned long *flags)
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
 {
        struct mem_cgroup *memcg;
+       unsigned long flags;
 
+       /*
+        * The RCU lock is held throughout the transaction.  The fast
+        * path can get away without acquiring the memcg->move_lock
+        * because page moving starts with an RCU grace period.
+        *
+        * The RCU lock also protects the memcg from being freed when
+        * the page state that is going to change is the only thing
+        * preventing the page from being uncharged.
+        * E.g. end-writeback clearing PageWriteback(), which allows
+        * migration to go ahead and uncharge the page before the
+        * account transaction might be complete.
+        */
        rcu_read_lock();
 
        if (mem_cgroup_disabled())
@@ -2017,16 +1983,22 @@ again:
        if (unlikely(!memcg))
                return NULL;
 
-       *locked = false;
        if (atomic_read(&memcg->moving_account) <= 0)
                return memcg;
 
-       spin_lock_irqsave(&memcg->move_lock, *flags);
+       spin_lock_irqsave(&memcg->move_lock, flags);
        if (memcg != page->mem_cgroup) {
-               spin_unlock_irqrestore(&memcg->move_lock, *flags);
+               spin_unlock_irqrestore(&memcg->move_lock, flags);
                goto again;
        }
-       *locked = true;
+
+       /*
+        * When charge migration first begins, we can have locked and
+        * unlocked page stat updates happening concurrently.  Track
+        * the task who has the lock for mem_cgroup_end_page_stat().
+        */
+       memcg->move_lock_task = current;
+       memcg->move_lock_flags = flags;
 
        return memcg;
 }
@@ -2034,14 +2006,17 @@ again:
 /**
  * mem_cgroup_end_page_stat - finish a page state statistics transaction
  * @memcg: the memcg that was accounted against
- * @locked: value received from mem_cgroup_begin_page_stat()
- * @flags: value received from mem_cgroup_begin_page_stat()
  */
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked,
-                             unsigned long *flags)
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 {
-       if (memcg && *locked)
-               spin_unlock_irqrestore(&memcg->move_lock, *flags);
+       if (memcg && memcg->move_lock_task == current) {
+               unsigned long flags = memcg->move_lock_flags;
+
+               memcg->move_lock_task = NULL;
+               memcg->move_lock_flags = 0;
+
+               spin_unlock_irqrestore(&memcg->move_lock, flags);
+       }
 
        rcu_read_unlock();
 }
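
As the updated kernel-doc above spells out, callers no longer thread a locked/flags pair through the transaction; the lock state is remembered in memcg->move_lock_task and move_lock_flags and is released by whichever task took it. A minimal hypothetical caller, only to illustrate the new pattern (the function name and mapcount test are placeholders; the stat index is one of the existing mem_cgroup_stat_index values):

/* Illustrative only: follows the pattern documented in
 * mem_cgroup_begin_page_stat() above; not taken from this patch. */
static void example_account_file_unmapped(struct page *page)
{
        struct mem_cgroup *memcg;

        memcg = mem_cgroup_begin_page_stat(page);  /* RCU, move_lock on slow path */
        if (atomic_add_negative(-1, &page->_mapcount))
                mem_cgroup_update_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED, -1);
        mem_cgroup_end_page_stat(memcg);           /* unlocks only if this task locked */
}
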
@@ -2134,17 +2109,6 @@ static void drain_local_stock(struct work_struct *dummy)
        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
-static void __init memcg_stock_init(void)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               struct memcg_stock_pcp *stock =
-                                       &per_cpu(memcg_stock, cpu);
-               INIT_WORK(&stock->work, drain_local_stock);
-       }
-}
-
 /*
  * Cache charges(val) to local per_cpu area.
  * This will be consumed by consume_stock() function, later.
@@ -2294,6 +2258,8 @@ retry:
        if (!(gfp_mask & __GFP_WAIT))
                goto nomem;
 
+       mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
+
        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                                                    gfp_mask, may_swap);
 
@@ -2335,6 +2301,8 @@ retry:
        if (fatal_signal_pending(current))
                goto bypass;
 
+       mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
+
        mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
 nomem:
        if (!(gfp_mask & __GFP_NOFAIL))
@@ -2346,6 +2314,16 @@ done_restock:
        css_get_many(&memcg->css, batch);
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);
+       /*
+        * If the hierarchy is above the normal consumption range,
+        * make the charging task trim their excess contribution.
+        */
+       do {
+               if (page_counter_read(&memcg->memory) <= memcg->high)
+                       continue;
+               mem_cgroup_events(memcg, MEMCG_HIGH, 1);
+               try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+       } while ((memcg = parent_mem_cgroup(memcg)));
 done:
        return ret;
 }
@@ -2476,27 +2454,8 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
 }
 
 #ifdef CONFIG_MEMCG_KMEM
-/*
- * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
- * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
- */
-static DEFINE_MUTEX(memcg_slab_mutex);
-
-/*
- * This is a bit cumbersome, but it is rarely used and avoids a backpointer
- * in the memcg_cache_params struct.
- */
-static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
-{
-       struct kmem_cache *cachep;
-
-       VM_BUG_ON(p->is_root_cache);
-       cachep = p->root_cache;
-       return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
-}
-
-static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
-                            unsigned long nr_pages)
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
+                     unsigned long nr_pages)
 {
        struct page_counter *counter;
        int ret = 0;
@@ -2533,8 +2492,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
        return ret;
 }
 
-static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
-                               unsigned long nr_pages)
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
 {
        page_counter_uncharge(&memcg->memory, nr_pages);
        if (do_swap_account)
@@ -2560,18 +2518,19 @@ static int memcg_alloc_cache_id(void)
        int id, size;
        int err;
 
-       id = ida_simple_get(&kmem_limited_groups,
+       id = ida_simple_get(&memcg_cache_ida,
                            0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
        if (id < 0)
                return id;
 
-       if (id < memcg_limited_groups_array_size)
+       if (id < memcg_nr_cache_ids)
                return id;
 
        /*
         * There's no space for the new id in memcg_caches arrays,
         * so we have to grow them.
         */
+       down_write(&memcg_cache_ids_sem);
 
        size = 2 * (id + 1);
        if (size < MEMCG_CACHES_MIN_SIZE)
@@ -2579,12 +2538,16 @@ static int memcg_alloc_cache_id(void)
        else if (size > MEMCG_CACHES_MAX_SIZE)
                size = MEMCG_CACHES_MAX_SIZE;
 
-       mutex_lock(&memcg_slab_mutex);
        err = memcg_update_all_caches(size);
-       mutex_unlock(&memcg_slab_mutex);
+       if (!err)
+               err = memcg_update_all_list_lrus(size);
+       if (!err)
+               memcg_nr_cache_ids = size;
+
+       up_write(&memcg_cache_ids_sem);
 
        if (err) {
-               ida_simple_remove(&kmem_limited_groups, id);
+               ida_simple_remove(&memcg_cache_ida, id);
                return err;
        }
        return id;
@@ -2592,136 +2555,23 @@ static int memcg_alloc_cache_id(void)
 
 static void memcg_free_cache_id(int id)
 {
-       ida_simple_remove(&kmem_limited_groups, id);
-}
-
-/*
- * We should update the current array size iff all caches updates succeed. This
- * can only be done from the slab side. The slab mutex needs to be held when
- * calling this.
- */
-void memcg_update_array_size(int num)
-{
-       memcg_limited_groups_array_size = num;
-}
-
-static void memcg_register_cache(struct mem_cgroup *memcg,
-                                struct kmem_cache *root_cache)
-{
-       static char memcg_name_buf[NAME_MAX + 1]; /* protected by
-                                                    memcg_slab_mutex */
-       struct kmem_cache *cachep;
-       int id;
-
-       lockdep_assert_held(&memcg_slab_mutex);
-
-       id = memcg_cache_id(memcg);
-
-       /*
-        * Since per-memcg caches are created asynchronously on first
-        * allocation (see memcg_kmem_get_cache()), several threads can try to
-        * create the same cache, but only one of them may succeed.
-        */
-       if (cache_from_memcg_idx(root_cache, id))
-               return;
-
-       cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
-       cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
-       /*
-        * If we could not create a memcg cache, do not complain, because
-        * that's not critical at all as we can always proceed with the root
-        * cache.
-        */
-       if (!cachep)
-               return;
-
-       list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
-
-       /*
-        * Since readers won't lock (see cache_from_memcg_idx()), we need a
-        * barrier here to ensure nobody will see the kmem_cache partially
-        * initialized.
-        */
-       smp_wmb();
-
-       BUG_ON(root_cache->memcg_params->memcg_caches[id]);
-       root_cache->memcg_params->memcg_caches[id] = cachep;
-}
-
-static void memcg_unregister_cache(struct kmem_cache *cachep)
-{
-       struct kmem_cache *root_cache;
-       struct mem_cgroup *memcg;
-       int id;
-
-       lockdep_assert_held(&memcg_slab_mutex);
-
-       BUG_ON(is_root_cache(cachep));
-
-       root_cache = cachep->memcg_params->root_cache;
-       memcg = cachep->memcg_params->memcg;
-       id = memcg_cache_id(memcg);
-
-       BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
-       root_cache->memcg_params->memcg_caches[id] = NULL;
-
-       list_del(&cachep->memcg_params->list);
-
-       kmem_cache_destroy(cachep);
-}
-
-int __memcg_cleanup_cache_params(struct kmem_cache *s)
-{
-       struct kmem_cache *c;
-       int i, failed = 0;
-
-       mutex_lock(&memcg_slab_mutex);
-       for_each_memcg_cache_index(i) {
-               c = cache_from_memcg_idx(s, i);
-               if (!c)
-                       continue;
-
-               memcg_unregister_cache(c);
-
-               if (cache_from_memcg_idx(s, i))
-                       failed++;
-       }
-       mutex_unlock(&memcg_slab_mutex);
-       return failed;
-}
-
-static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
-       struct kmem_cache *cachep;
-       struct memcg_cache_params *params, *tmp;
-
-       if (!memcg_kmem_is_active(memcg))
-               return;
-
-       mutex_lock(&memcg_slab_mutex);
-       list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
-               cachep = memcg_params_to_cache(params);
-               memcg_unregister_cache(cachep);
-       }
-       mutex_unlock(&memcg_slab_mutex);
+       ida_simple_remove(&memcg_cache_ida, id);
 }
 
-struct memcg_register_cache_work {
+struct memcg_kmem_cache_create_work {
        struct mem_cgroup *memcg;
        struct kmem_cache *cachep;
        struct work_struct work;
 };
 
-static void memcg_register_cache_func(struct work_struct *w)
+static void memcg_kmem_cache_create_func(struct work_struct *w)
 {
-       struct memcg_register_cache_work *cw =
-               container_of(w, struct memcg_register_cache_work, work);
+       struct memcg_kmem_cache_create_work *cw =
+               container_of(w, struct memcg_kmem_cache_create_work, work);
        struct mem_cgroup *memcg = cw->memcg;
        struct kmem_cache *cachep = cw->cachep;
 
-       mutex_lock(&memcg_slab_mutex);
-       memcg_register_cache(memcg, cachep);
-       mutex_unlock(&memcg_slab_mutex);
+       memcg_create_kmem_cache(memcg, cachep);
 
        css_put(&memcg->css);
        kfree(cw);
@@ -2730,10 +2580,10 @@ static void memcg_register_cache_func(struct work_struct *w)
 /*
  * Enqueue the creation of a per-memcg kmem_cache.
  */
-static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
-                                           struct kmem_cache *cachep)
+static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+                                              struct kmem_cache *cachep)
 {
-       struct memcg_register_cache_work *cw;
+       struct memcg_kmem_cache_create_work *cw;
 
        cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
        if (!cw)
@@ -2743,18 +2593,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
 
        cw->memcg = memcg;
        cw->cachep = cachep;
+       INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
 
-       INIT_WORK(&cw->work, memcg_register_cache_func);
        schedule_work(&cw->work);
 }
 
-static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
-                                         struct kmem_cache *cachep)
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+                                            struct kmem_cache *cachep)
 {
        /*
         * We need to stop accounting when we kmalloc, because if the
         * corresponding kmalloc cache is not yet created, the first allocation
-        * in __memcg_schedule_register_cache will recurse.
+        * in __memcg_schedule_kmem_cache_create will recurse.
         *
         * However, it is better to enclose the whole function. Depending on
         * the debugging options enabled, INIT_WORK(), for instance, can
@@ -2763,24 +2613,10 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
         * the safest choice is to do it like this, wrapping the whole function.
         */
        current->memcg_kmem_skip_account = 1;
-       __memcg_schedule_register_cache(memcg, cachep);
+       __memcg_schedule_kmem_cache_create(memcg, cachep);
        current->memcg_kmem_skip_account = 0;
 }
 
-int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
-{
-       unsigned int nr_pages = 1 << order;
-
-       return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
-}
-
-void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
-{
-       unsigned int nr_pages = 1 << order;
-
-       memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
-}
-
 /*
  * Return the kmem_cache we're supposed to use for a slab allocation.
  * We try to use the current memcg's version of the cache.
@@ -2798,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
        struct mem_cgroup *memcg;
        struct kmem_cache *memcg_cachep;
+       int kmemcg_id;
 
-       VM_BUG_ON(!cachep->memcg_params);
-       VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+       VM_BUG_ON(!is_root_cache(cachep));
 
        if (current->memcg_kmem_skip_account)
                return cachep;
 
        memcg = get_mem_cgroup_from_mm(current->mm);
-       if (!memcg_kmem_is_active(memcg))
+       kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
+       if (kmemcg_id < 0)
                goto out;
 
-       memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+       memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
        if (likely(memcg_cachep))
                return memcg_cachep;
 
@@ -2825,7 +2662,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
         * could happen with the slab_mutex held. So it's better to
         * defer everything.
         */
-       memcg_schedule_register_cache(memcg, cachep);
+       memcg_schedule_kmem_cache_create(memcg, cachep);
 out:
        css_put(&memcg->css);
        return cachep;
@@ -2834,7 +2671,7 @@ out:
 void __memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
        if (!is_root_cache(cachep))
-               css_put(&cachep->memcg_params->memcg->css);
+               css_put(&cachep->memcg_params.memcg->css);
 }
 
 /*
@@ -2899,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
        memcg_uncharge_kmem(memcg, 1 << order);
        page->mem_cgroup = NULL;
 }
+
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
+{
+       struct mem_cgroup *memcg = NULL;
+       struct kmem_cache *cachep;
+       struct page *page;
+
+       page = virt_to_head_page(ptr);
+       if (PageSlab(page)) {
+               cachep = page->slab_cache;
+               if (!is_root_cache(cachep))
+                       memcg = cachep->memcg_params.memcg;
+       } else
+               /* page allocated by alloc_kmem_pages */
+               memcg = page->mem_cgroup;
+
+       return memcg;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3433,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
        int err = 0;
        int memcg_id;
 
-       if (memcg_kmem_is_active(memcg))
-               return 0;
+       BUG_ON(memcg->kmemcg_id >= 0);
+       BUG_ON(memcg->kmem_acct_activated);
+       BUG_ON(memcg->kmem_acct_active);
 
        /*
         * For simplicity, we won't allow this to be disabled.  It also can't
@@ -3477,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
         * patched.
         */
        memcg->kmemcg_id = memcg_id;
+       memcg->kmem_acct_activated = true;
+       memcg->kmem_acct_active = true;
 out:
        return err;
 }
@@ -3533,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
        int ret;
 
        buf = strstrip(buf);
-       ret = page_counter_memparse(buf, &nr_pages);
+       ret = page_counter_memparse(buf, "-1", &nr_pages);
        if (ret)
                return ret;
 
@@ -3609,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
-       if (val >= (1 << NR_MOVE_TYPE))
+       if (val & ~MOVE_MASK)
                return -EINVAL;
 
        /*
@@ -3687,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v)
        struct mem_cgroup *mi;
        unsigned int i;
 
+       BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
+                    MEM_CGROUP_STAT_NSTATS);
+       BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
+                    MEM_CGROUP_EVENTS_NSTATS);
        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
 
        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -3901,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
        unsigned long usage;
        int i, size, ret;
 
-       ret = page_counter_memparse(args, &threshold);
+       ret = page_counter_memparse(args, "-1", &threshold);
        if (ret)
                return ret;
 
@@ -4152,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
        return mem_cgroup_sockets_init(memcg, ss);
 }
 
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+       struct cgroup_subsys_state *css;
+       struct mem_cgroup *parent, *child;
+       int kmemcg_id;
+
+       if (!memcg->kmem_acct_active)
+               return;
+
+       /*
+        * Clear the 'active' flag before clearing memcg_caches arrays entries.
+        * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
+        * guarantees no cache will be created for this cgroup after we are
+        * done (see memcg_create_kmem_cache()).
+        */
+       memcg->kmem_acct_active = false;
+
+       memcg_deactivate_kmem_caches(memcg);
+
+       kmemcg_id = memcg->kmemcg_id;
+       BUG_ON(kmemcg_id < 0);
+
+       parent = parent_mem_cgroup(memcg);
+       if (!parent)
+               parent = root_mem_cgroup;
+
+       /*
+        * Change kmemcg_id of this cgroup and all its descendants to the
+        * parent's id, and then move all entries from this cgroup's list_lrus
+        * to ones of the parent. After we have finished, all list_lrus
+        * corresponding to this cgroup are guaranteed to remain empty. The
+        * ordering is imposed by list_lru_node->lock taken by
+        * memcg_drain_all_list_lrus().
+        */
+       css_for_each_descendant_pre(css, &memcg->css) {
+               child = mem_cgroup_from_css(css);
+               BUG_ON(child->kmemcg_id != kmemcg_id);
+               child->kmemcg_id = parent->kmemcg_id;
+               if (!memcg->use_hierarchy)
+                       break;
+       }
+       memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+
+       memcg_free_cache_id(kmemcg_id);
+}
+
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
-       memcg_unregister_all_caches(memcg);
+       if (memcg->kmem_acct_activated) {
+               memcg_destroy_kmem_caches(memcg);
+               static_key_slow_dec(&memcg_kmem_enabled_key);
+               WARN_ON(page_counter_read(&memcg->kmem));
+       }
        mem_cgroup_sockets_destroy(memcg);
 }
 #else
@@ -4163,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
        return 0;
 }
 
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+}
+
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
 }
@@ -4391,7 +4307,7 @@ out_kfree:
        return ret;
 }
 
-static struct cftype mem_cgroup_files[] = {
+static struct cftype mem_cgroup_legacy_files[] = {
        {
                .name = "usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
@@ -4502,34 +4418,6 @@ static struct cftype mem_cgroup_files[] = {
        { },    /* terminate */
 };
 
-#ifdef CONFIG_MEMCG_SWAP
-static struct cftype memsw_cgroup_files[] = {
-       {
-               .name = "memsw.usage_in_bytes",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
-               .read_u64 = mem_cgroup_read_u64,
-       },
-       {
-               .name = "memsw.max_usage_in_bytes",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
-               .write = mem_cgroup_reset,
-               .read_u64 = mem_cgroup_read_u64,
-       },
-       {
-               .name = "memsw.limit_in_bytes",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
-               .write = mem_cgroup_write,
-               .read_u64 = mem_cgroup_read_u64,
-       },
-       {
-               .name = "memsw.failcnt",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
-               .write = mem_cgroup_reset,
-               .read_u64 = mem_cgroup_read_u64,
-       },
-       { },    /* terminate */
-};
-#endif
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
        struct mem_cgroup_per_node *pn;
@@ -4609,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
                free_mem_cgroup_per_zone_info(memcg, node);
 
        free_percpu(memcg->stat);
-
-       disarm_static_keys(memcg);
        kfree(memcg);
 }
 
@@ -4625,29 +4511,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
-static void __init mem_cgroup_soft_limit_tree_init(void)
-{
-       struct mem_cgroup_tree_per_node *rtpn;
-       struct mem_cgroup_tree_per_zone *rtpz;
-       int tmp, node, zone;
-
-       for_each_node(node) {
-               tmp = node;
-               if (!node_state(node, N_NORMAL_MEMORY))
-                       tmp = -1;
-               rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
-               BUG_ON(!rtpn);
-
-               soft_limit_tree.rb_tree_per_node[node] = rtpn;
-
-               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-                       rtpz = &rtpn->rb_tree_per_zone[zone];
-                       rtpz->rb_root = RB_ROOT;
-                       spin_lock_init(&rtpz->lock);
-               }
-       }
-}
-
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -4667,6 +4530,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        if (parent_css == NULL) {
                root_mem_cgroup = memcg;
                page_counter_init(&memcg->memory, NULL);
+               memcg->high = PAGE_COUNTER_MAX;
                memcg->soft_limit = PAGE_COUNTER_MAX;
                page_counter_init(&memcg->memsw, NULL);
                page_counter_init(&memcg->kmem, NULL);
@@ -4682,7 +4546,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        spin_lock_init(&memcg->event_list_lock);
 #ifdef CONFIG_MEMCG_KMEM
        memcg->kmemcg_id = -1;
-       INIT_LIST_HEAD(&memcg->memcg_slab_caches);
 #endif
 
        return &memcg->css;
@@ -4713,6 +4576,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
 
        if (parent->use_hierarchy) {
                page_counter_init(&memcg->memory, &parent->memory);
+               memcg->high = PAGE_COUNTER_MAX;
                memcg->soft_limit = PAGE_COUNTER_MAX;
                page_counter_init(&memcg->memsw, &parent->memsw);
                page_counter_init(&memcg->kmem, &parent->kmem);
@@ -4723,6 +4587,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
                 */
        } else {
                page_counter_init(&memcg->memory, NULL);
+               memcg->high = PAGE_COUNTER_MAX;
                memcg->soft_limit = PAGE_COUNTER_MAX;
                page_counter_init(&memcg->memsw, NULL);
                page_counter_init(&memcg->kmem, NULL);
@@ -4768,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        spin_unlock(&memcg->event_list_lock);
 
        vmpressure_cleanup(&memcg->vmpressure);
+
+       memcg_deactivate_kmem(memcg);
 }
 
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4798,6 +4665,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
        mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
        mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
        memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+       memcg->low = 0;
+       memcg->high = PAGE_COUNTER_MAX;
        memcg->soft_limit = PAGE_COUNTER_MAX;
 }
 
@@ -4874,12 +4743,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
        if (!page || !page_mapped(page))
                return NULL;
        if (PageAnon(page)) {
-               /* we don't move shared anon */
-               if (!move_anon())
+               if (!(mc.flags & MOVE_ANON))
                        return NULL;
-       } else if (!move_file())
-               /* we ignore mapcount for file pages */
-               return NULL;
+       } else {
+               if (!(mc.flags & MOVE_FILE))
+                       return NULL;
+       }
        if (!get_page_unless_zero(page))
                return NULL;
 
@@ -4893,7 +4762,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
        struct page *page = NULL;
        swp_entry_t ent = pte_to_swp_entry(ptent);
 
-       if (!move_anon() || non_swap_entry(ent))
+       if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
                return NULL;
        /*
         * Because lookup_swap_cache() updates some statistics counter,
@@ -4922,14 +4791,11 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 
        if (!vma->vm_file) /* anonymous vma */
                return NULL;
-       if (!move_file())
+       if (!(mc.flags & MOVE_FILE))
                return NULL;
 
        mapping = vma->vm_file->f_mapping;
-       if (pte_none(ptent))
-               pgoff = linear_page_index(vma, addr);
-       else /* pte_file(ptent) is true */
-               pgoff = pte_to_pgoff(ptent);
+       pgoff = linear_page_index(vma, addr);
 
        /* page is moved even if it's not RSS of this task(page-faulted). */
 #ifdef CONFIG_SWAP
@@ -4961,7 +4827,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
                page = mc_handle_present_pte(vma, addr, ptent);
        else if (is_swap_pte(ptent))
                page = mc_handle_swap_pte(vma, addr, ptent, &ent);
-       else if (pte_none(ptent) || pte_file(ptent))
+       else if (pte_none(ptent))
                page = mc_handle_file_pte(vma, addr, ptent, &ent);
 
        if (!page && !ent.val)
@@ -5004,7 +4870,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 
        page = pmd_page(pmd);
        VM_BUG_ON_PAGE(!page || !PageHead(page), page);
-       if (!move_anon())
+       if (!(mc.flags & MOVE_ANON))
                return ret;
        if (page->mem_cgroup == mc.from) {
                ret = MC_TARGET_PAGE;
@@ -5027,7 +4893,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
                                        unsigned long addr, unsigned long end,
                                        struct mm_walk *walk)
 {
-       struct vm_area_struct *vma = walk->private;
+       struct vm_area_struct *vma = walk->vma;
        pte_t *pte;
        spinlock_t *ptl;
 
@@ -5053,20 +4919,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 {
        unsigned long precharge;
-       struct vm_area_struct *vma;
 
+       struct mm_walk mem_cgroup_count_precharge_walk = {
+               .pmd_entry = mem_cgroup_count_precharge_pte_range,
+               .mm = mm,
+       };
        down_read(&mm->mmap_sem);
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               struct mm_walk mem_cgroup_count_precharge_walk = {
-                       .pmd_entry = mem_cgroup_count_precharge_pte_range,
-                       .mm = mm,
-                       .private = vma,
-               };
-               if (is_vm_hugetlb_page(vma))
-                       continue;
-               walk_page_range(vma->vm_start, vma->vm_end,
-                                       &mem_cgroup_count_precharge_walk);
-       }
+       walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
        up_read(&mm->mmap_sem);
 
        precharge = mc.precharge;
@@ -5146,15 +5005,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
        struct task_struct *p = cgroup_taskset_first(tset);
        int ret = 0;
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-       unsigned long move_charge_at_immigrate;
+       unsigned long move_flags;
 
        /*
        * We are now committed to this value whatever it is. Changes in this
         * tunable will only affect upcoming migrations, not the current one.
         * So we need to save it, and keep it going.
         */
-       move_charge_at_immigrate  = memcg->move_charge_at_immigrate;
-       if (move_charge_at_immigrate) {
+       move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
+       if (move_flags) {
                struct mm_struct *mm;
                struct mem_cgroup *from = mem_cgroup_from_task(p);
 
@@ -5174,7 +5033,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
                        spin_lock(&mc.lock);
                        mc.from = from;
                        mc.to = memcg;
-                       mc.immigrate_flags = move_charge_at_immigrate;
+                       mc.flags = move_flags;
                        spin_unlock(&mc.lock);
                        /* We set mc.moving_task later */
 
@@ -5199,7 +5058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                                struct mm_walk *walk)
 {
        int ret = 0;
-       struct vm_area_struct *vma = walk->private;
+       struct vm_area_struct *vma = walk->vma;
        pte_t *pte;
        spinlock_t *ptl;
        enum mc_target_type target_type;
@@ -5295,7 +5154,10 @@ put:                     /* get_mctgt_type() gets the page */
 
 static void mem_cgroup_move_charge(struct mm_struct *mm)
 {
-       struct vm_area_struct *vma;
+       struct mm_walk mem_cgroup_move_charge_walk = {
+               .pmd_entry = mem_cgroup_move_charge_pte_range,
+               .mm = mm,
+       };
 
        lru_add_drain_all();
        /*
@@ -5318,24 +5180,11 @@ retry:
                cond_resched();
                goto retry;
        }
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               int ret;
-               struct mm_walk mem_cgroup_move_charge_walk = {
-                       .pmd_entry = mem_cgroup_move_charge_pte_range,
-                       .mm = mm,
-                       .private = vma,
-               };
-               if (is_vm_hugetlb_page(vma))
-                       continue;
-               ret = walk_page_range(vma->vm_start, vma->vm_end,
-                                               &mem_cgroup_move_charge_walk);
-               if (ret)
-                       /*
-                        * means we have consumed all precharges and failed in
-                        * doing additional charge. Just abandon here.
-                        */
-                       break;
-       }
+       /*
+        * When we have consumed all precharges and failed in doing
+        * additional charge, the page walk just aborts.
+        */
+       walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
        up_read(&mm->mmap_sem);
        atomic_dec(&mc.from->moving_account);
 }
@@ -5386,118 +5235,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
                mem_cgroup_from_css(root_css)->use_hierarchy = true;
 }
 
-struct cgroup_subsys memory_cgrp_subsys = {
-       .css_alloc = mem_cgroup_css_alloc,
-       .css_online = mem_cgroup_css_online,
-       .css_offline = mem_cgroup_css_offline,
-       .css_free = mem_cgroup_css_free,
-       .css_reset = mem_cgroup_css_reset,
-       .can_attach = mem_cgroup_can_attach,
-       .cancel_attach = mem_cgroup_cancel_attach,
-       .attach = mem_cgroup_move_task,
-       .bind = mem_cgroup_bind,
-       .legacy_cftypes = mem_cgroup_files,
-       .early_init = 0,
-};
+static u64 memory_current_read(struct cgroup_subsys_state *css,
+                              struct cftype *cft)
+{
+       return mem_cgroup_usage(mem_cgroup_from_css(css), false);
+}
 
-#ifdef CONFIG_MEMCG_SWAP
-static int __init enable_swap_account(char *s)
+static int memory_low_show(struct seq_file *m, void *v)
 {
-       if (!strcmp(s, "1"))
-               really_do_swap_account = 1;
-       else if (!strcmp(s, "0"))
-               really_do_swap_account = 0;
-       return 1;
+       struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+       unsigned long low = ACCESS_ONCE(memcg->low);
+
+       if (low == PAGE_COUNTER_MAX)
+               seq_puts(m, "infinity\n");
+       else
+               seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
+
+       return 0;
 }
-__setup("swapaccount=", enable_swap_account);
 
-static void __init memsw_file_init(void)
+static ssize_t memory_low_write(struct kernfs_open_file *of,
+                               char *buf, size_t nbytes, loff_t off)
 {
-       WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
-                                         memsw_cgroup_files));
+       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+       unsigned long low;
+       int err;
+
+       buf = strstrip(buf);
+       err = page_counter_memparse(buf, "infinity", &low);
+       if (err)
+               return err;
+
+       memcg->low = low;
+
+       return nbytes;
 }
 
-static void __init enable_swap_cgroup(void)
+static int memory_high_show(struct seq_file *m, void *v)
 {
-       if (!mem_cgroup_disabled() && really_do_swap_account) {
-               do_swap_account = 1;
-               memsw_file_init();
-       }
+       struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+       unsigned long high = ACCESS_ONCE(memcg->high);
+
+       if (high == PAGE_COUNTER_MAX)
+               seq_puts(m, "infinity\n");
+       else
+               seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
+
+       return 0;
 }
 
-#else
-static void __init enable_swap_cgroup(void)
+static ssize_t memory_high_write(struct kernfs_open_file *of,
+                                char *buf, size_t nbytes, loff_t off)
 {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+       unsigned long high;
+       int err;
+
+       buf = strstrip(buf);
+       err = page_counter_memparse(buf, "infinity", &high);
+       if (err)
+               return err;
+
+       memcg->high = high;
+
+       return nbytes;
 }
-#endif
 
-#ifdef CONFIG_MEMCG_SWAP
-/**
- * mem_cgroup_swapout - transfer a memsw charge to swap
- * @page: page whose memsw charge to transfer
- * @entry: swap entry to move the charge to
- *
- * Transfer the memsw charge of @page to @entry.
- */
-void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+static int memory_max_show(struct seq_file *m, void *v)
 {
-       struct mem_cgroup *memcg;
-       unsigned short oldid;
+       struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+       unsigned long max = ACCESS_ONCE(memcg->memory.limit);
 
-       VM_BUG_ON_PAGE(PageLRU(page), page);
-       VM_BUG_ON_PAGE(page_count(page), page);
+       if (max == PAGE_COUNTER_MAX)
+               seq_puts(m, "infinity\n");
+       else
+               seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
 
-       if (!do_swap_account)
-               return;
+       return 0;
+}
 
-       memcg = page->mem_cgroup;
+static ssize_t memory_max_write(struct kernfs_open_file *of,
+                               char *buf, size_t nbytes, loff_t off)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+       unsigned long max;
+       int err;
 
-       /* Readahead page, never charged */
-       if (!memcg)
-               return;
+       buf = strstrip(buf);
+       err = page_counter_memparse(buf, "infinity", &max);
+       if (err)
+               return err;
 
-       oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
-       VM_BUG_ON_PAGE(oldid, page);
-       mem_cgroup_swap_statistics(memcg, true);
+       err = mem_cgroup_resize_limit(memcg, max);
+       if (err)
+               return err;
 
-       page->mem_cgroup = NULL;
+       return nbytes;
+}
 
-       if (!mem_cgroup_is_root(memcg))
-               page_counter_uncharge(&memcg->memory, 1);
+static int memory_events_show(struct seq_file *m, void *v)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
 
-       /* XXX: caller holds IRQ-safe mapping->tree_lock */
-       VM_BUG_ON(!irqs_disabled());
+       seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
+       seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
+       seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
+       seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
 
-       mem_cgroup_charge_statistics(memcg, page, -1);
-       memcg_check_events(memcg, page);
+       return 0;
 }
 
+static struct cftype memory_files[] = {
+       {
+               .name = "current",
+               .read_u64 = memory_current_read,
+       },
+       {
+               .name = "low",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = memory_low_show,
+               .write = memory_low_write,
+       },
+       {
+               .name = "high",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = memory_high_show,
+               .write = memory_high_write,
+       },
+       {
+               .name = "max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = memory_max_show,
+               .write = memory_max_write,
+       },
+       {
+               .name = "events",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = memory_events_show,
+       },
+       { }     /* terminate */
+};
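
These default-hierarchy files take the syntax implied by the page_counter_memparse() calls in the write handlers above: a byte count handled by memparse() (so K/M/G suffixes work) or the literal string "infinity". A small userspace sketch; the cgroup path is purely an assumption:

#include <stdio.h>

/* Illustration only: the mount point and group name are assumptions. */
static int example_set_memory_high(const char *cgroup_dir, const char *value)
{
        char path[256];
        FILE *f;
        int ret = 0;

        snprintf(path, sizeof(path), "%s/memory.high", cgroup_dir);
        f = fopen(path, "w");
        if (!f)
                return -1;
        if (fprintf(f, "%s\n", value) < 0)
                ret = -1;
        if (fclose(f) != 0)
                ret = -1;
        return ret;
}

/* e.g. example_set_memory_high("/sys/fs/cgroup/mygroup", "512M");
 *      example_set_memory_high("/sys/fs/cgroup/mygroup", "infinity"); */
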
+
+struct cgroup_subsys memory_cgrp_subsys = {
+       .css_alloc = mem_cgroup_css_alloc,
+       .css_online = mem_cgroup_css_online,
+       .css_offline = mem_cgroup_css_offline,
+       .css_free = mem_cgroup_css_free,
+       .css_reset = mem_cgroup_css_reset,
+       .can_attach = mem_cgroup_can_attach,
+       .cancel_attach = mem_cgroup_cancel_attach,
+       .attach = mem_cgroup_move_task,
+       .bind = mem_cgroup_bind,
+       .dfl_cftypes = memory_files,
+       .legacy_cftypes = mem_cgroup_legacy_files,
+       .early_init = 0,
+};
+
 /**
- * mem_cgroup_uncharge_swap - uncharge a swap entry
- * @entry: swap entry to uncharge
+ * mem_cgroup_events - count memory events against a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event index
+ * @nr: the number of events to account for
+ */
+void mem_cgroup_events(struct mem_cgroup *memcg,
+                      enum mem_cgroup_events_index idx,
+                      unsigned int nr)
+{
+       this_cpu_add(memcg->stat->events[idx], nr);
+}
+
+/**
+ * mem_cgroup_low - check if memory consumption is below the normal range
+ * @root: the highest ancestor to consider
+ * @memcg: the memory cgroup to check
  *
- * Drop the memsw charge associated with @entry.
+ * Returns %true if memory consumption of @memcg, and that of all
+ * configurable ancestors up to @root, is below the normal range.
  */
-void mem_cgroup_uncharge_swap(swp_entry_t entry)
+bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
 {
-       struct mem_cgroup *memcg;
-       unsigned short id;
+       if (mem_cgroup_disabled())
+               return false;
 
-       if (!do_swap_account)
-               return;
+       /*
+        * The toplevel group doesn't have a configurable range, so
+        * it's never low when looked at directly, and it is not
+        * considered an ancestor when assessing the hierarchy.
+        */
 
-       id = swap_cgroup_record(entry, 0);
-       rcu_read_lock();
-       memcg = mem_cgroup_lookup(id);
-       if (memcg) {
-               if (!mem_cgroup_is_root(memcg))
-                       page_counter_uncharge(&memcg->memsw, 1);
-               mem_cgroup_swap_statistics(memcg, false);
-               css_put(&memcg->css);
+       if (memcg == root_mem_cgroup)
+               return false;
+
+       if (page_counter_read(&memcg->memory) > memcg->low)
+               return false;
+
+       while (memcg != root) {
+               memcg = parent_mem_cgroup(memcg);
+
+               if (memcg == root_mem_cgroup)
+                       break;
+
+               if (page_counter_read(&memcg->memory) > memcg->low)
+                       return false;
        }
-       rcu_read_unlock();
+       return true;
 }
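
The consumer of mem_cgroup_low() is on the reclaim side and not part of this file; roughly, the hierarchy walk skips groups that are still within their low boundary and records a MEMCG_LOW event when it is forced to reclaim from them anyway, which then shows up in memory.events. A sketch under those assumptions; only mem_cgroup_low() and mem_cgroup_events() come from this patch:

/* Sketch of a reclaim-side caller; not part of this diff.  The helper
 * name and the may_thrash flag are placeholders. */
static bool example_may_reclaim_memcg(struct mem_cgroup *root,
                                      struct mem_cgroup *memcg,
                                      bool may_thrash)
{
        if (mem_cgroup_low(root, memcg)) {
                if (!may_thrash)
                        return false;   /* respect the low boundary */
                mem_cgroup_events(memcg, MEMCG_LOW, 1);
        }
        return true;
}
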
-#endif
 
 /**
  * mem_cgroup_try_charge - try charging a page
@@ -5831,10 +5773,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
  */
 static int __init mem_cgroup_init(void)
 {
+       int cpu, node;
+
        hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
-       enable_swap_cgroup();
-       mem_cgroup_soft_limit_tree_init();
-       memcg_stock_init();
+
+       for_each_possible_cpu(cpu)
+               INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
+                         drain_local_stock);
+
+       for_each_node(node) {
+               struct mem_cgroup_tree_per_node *rtpn;
+               int zone;
+
+               rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
+                                   node_online(node) ? node : NUMA_NO_NODE);
+
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       struct mem_cgroup_tree_per_zone *rtpz;
+
+                       rtpz = &rtpn->rb_tree_per_zone[zone];
+                       rtpz->rb_root = RB_ROOT;
+                       spin_lock_init(&rtpz->lock);
+               }
+               soft_limit_tree.rb_tree_per_node[node] = rtpn;
+       }
+
        return 0;
 }
 subsys_initcall(mem_cgroup_init);
+
+#ifdef CONFIG_MEMCG_SWAP
+/**
+ * mem_cgroup_swapout - transfer a memsw charge to swap
+ * @page: page whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @page to @entry.
+ */
+void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+{
+       struct mem_cgroup *memcg;
+       unsigned short oldid;
+
+       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_PAGE(page_count(page), page);
+
+       if (!do_swap_account)
+               return;
+
+       memcg = page->mem_cgroup;
+
+       /* Readahead page, never charged */
+       if (!memcg)
+               return;
+
+       oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+       VM_BUG_ON_PAGE(oldid, page);
+       mem_cgroup_swap_statistics(memcg, true);
+
+       page->mem_cgroup = NULL;
+
+       if (!mem_cgroup_is_root(memcg))
+               page_counter_uncharge(&memcg->memory, 1);
+
+       /* XXX: caller holds IRQ-safe mapping->tree_lock */
+       VM_BUG_ON(!irqs_disabled());
+
+       mem_cgroup_charge_statistics(memcg, page, -1);
+       memcg_check_events(memcg, page);
+}
+
+/**
+ * mem_cgroup_uncharge_swap - uncharge a swap entry
+ * @entry: swap entry to uncharge
+ *
+ * Drop the memsw charge associated with @entry.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t entry)
+{
+       struct mem_cgroup *memcg;
+       unsigned short id;
+
+       if (!do_swap_account)
+               return;
+
+       id = swap_cgroup_record(entry, 0);
+       rcu_read_lock();
+       memcg = mem_cgroup_lookup(id);
+       if (memcg) {
+               if (!mem_cgroup_is_root(memcg))
+                       page_counter_uncharge(&memcg->memsw, 1);
+               mem_cgroup_swap_statistics(memcg, false);
+               css_put(&memcg->css);
+       }
+       rcu_read_unlock();
+}
+
+/* for remembering the boot option */
+#ifdef CONFIG_MEMCG_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata;
+#endif
+
+static int __init enable_swap_account(char *s)
+{
+       if (!strcmp(s, "1"))
+               really_do_swap_account = 1;
+       else if (!strcmp(s, "0"))
+               really_do_swap_account = 0;
+       return 1;
+}
+__setup("swapaccount=", enable_swap_account);
+
+static struct cftype memsw_cgroup_files[] = {
+       {
+               .name = "memsw.usage_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+               .read_u64 = mem_cgroup_read_u64,
+       },
+       {
+               .name = "memsw.max_usage_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+               .write = mem_cgroup_reset,
+               .read_u64 = mem_cgroup_read_u64,
+       },
+       {
+               .name = "memsw.limit_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+               .write = mem_cgroup_write,
+               .read_u64 = mem_cgroup_read_u64,
+       },
+       {
+               .name = "memsw.failcnt",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+               .write = mem_cgroup_reset,
+               .read_u64 = mem_cgroup_read_u64,
+       },
+       { },    /* terminate */
+};
+
+static int __init mem_cgroup_swap_init(void)
+{
+       if (!mem_cgroup_disabled() && really_do_swap_account) {
+               do_swap_account = 1;
+               WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
+                                                 memsw_cgroup_files));
+       }
+       return 0;
+}
+subsys_initcall(mem_cgroup_swap_init);
+
+#endif /* CONFIG_MEMCG_SWAP */