cgroup: remove synchronize_rcu() from cgroup_diput()

[mirror_ubuntu-bionic-kernel.git] / kernel / cgroup.c
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index f24f724620dd8489fc2e3cb9781433df84096c96..02e4f201472e1c213d3a8b87af0fcc80582ffb87 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
  #include <linux/module.h>
  #include <linux/delayacct.h>
  #include <linux/cgroupstats.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
  #include <linux/namei.h>
  #include <linux/pid_namespace.h>
  #include <linux/idr.h>
@@ -138,6 +138,9 @@ struct cgroupfs_root {
         /* Hierarchy-specific flags */
         unsigned long flags;
  
+       /* IDs for cgroups in this hierarchy */
+       struct ida cgroup_ida;
+
         /* The path to use for release notifications. */
         char release_agent_path[PATH_MAX];
  
@@ -171,8 +174,8 @@ struct css_id {
          * The css to which this ID points. This pointer is set to valid value
          * after cgroup is populated. If cgroup is removed, this will be NULL.
          * This pointer is expected to be RCU-safe because destroy()
-        * is called after synchronize_rcu(). But for safe use, css_is_removed()
-        * css_tryget() should be used for avoiding race.
+        * is called after synchronize_rcu(). But for safe use, css_tryget()
+        * should be used for avoiding race.
          */
         struct cgroup_subsys_state __rcu *css;
         /*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
   */
  static int need_forkexit_callback __read_mostly;
  
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+                             struct cftype cfts[], bool is_add);
+
  #ifdef CONFIG_PROVE_LOCKING
  int cgroup_lock_is_held(void)
  {
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
         return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  }
  
-static int clone_children(const struct cgroup *cgrp)
-{
-       return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-}
-
  /*
   * for_each_subsys() allows you to iterate on each subsystem attached to
   * an active hierarchy
@@ -374,22 +376,18 @@ static int css_set_count;
   * account cgroups in empty hierarchies.
   */
  #define CSS_SET_HASH_BITS      7
-#define CSS_SET_TABLE_SIZE     (1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
  
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
  {
         int i;
-       int index;
-       unsigned long tmp = 0UL;
+       unsigned long key = 0UL;
  
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
-               tmp += (unsigned long)css[i];
-       tmp = (tmp >> 16) ^ tmp;
+               key += (unsigned long)css[i];
+       key = (key >> 16) ^ key;
  
-       index = hash_long(tmp, CSS_SET_HASH_BITS);
-
-       return &css_set_table[index];
+       return key;
  }
  
  /* We don't maintain the lists running through each css_set to its
@@ -416,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
         }
  
         /* This css_set is dead. unlink it and release cgroup refcounts */
-       hlist_del(&cg->hlist);
+       hash_del(&cg->hlist);
         css_set_count--;
  
         list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -548,9 +546,9 @@ static struct css_set *find_existing_css_set(
  {
         int i;
         struct cgroupfs_root *root = cgrp->root;
-       struct hlist_head *hhead;
         struct hlist_node *node;
         struct css_set *cg;
+       unsigned long key;
  
         /*
          * Build the set of subsystem state objects that we want to see in the
@@ -570,8 +568,8 @@ static struct css_set *find_existing_css_set(
                 }
         }
  
-       hhead = css_set_hash(template);
-       hlist_for_each_entry(cg, node, hhead, hlist) {
+       key = css_set_hash(template);
+       hash_for_each_possible(css_set_table, cg, node, hlist, key) {
                 if (!compare_css_sets(cg, oldcg, cgrp, template))
                         continue;
  
@@ -655,8 +653,8 @@ static struct css_set *find_css_set(
  
         struct list_head tmp_cg_links;
  
-       struct hlist_head *hhead;
         struct cg_cgroup_link *link;
+       unsigned long key;
  
         /* First see if we already have a cgroup group that matches
          * the desired set */
@@ -702,8 +700,8 @@ static struct css_set *find_css_set(
         css_set_count++;
  
         /* Add this cgroup group to the hash table */
-       hhead = css_set_hash(res->subsys);
-       hlist_add_head(&res->hlist, hhead);
+       key = css_set_hash(res->subsys);
+       hash_add(css_set_table, &res->hlist, key);
  
         write_unlock(&css_set_lock);
  
@@ -782,12 +780,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
   *     The task_lock() exception
   *
   * The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one task's cgroup pointer with
   * another.  It does so using cgroup_mutex, however there are
   * several performance critical places that need to reference
   * task->cgroup without the expense of grabbing a system global
   * mutex.  Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroup pointer we use
   * task_lock(), which acts on a spinlock (task->alloc_lock) already in
   * the task_struct routinely used for such matters.
   *
@@ -854,28 +852,44 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
         return inode;
  }
  
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
+static void cgroup_free_fn(struct work_struct *work)
  {
+       struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
         struct cgroup_subsys *ss;
-       int ret = 0;
  
-       for_each_subsys(cgrp->root, ss) {
-               if (!ss->pre_destroy)
-                       continue;
+       mutex_lock(&cgroup_mutex);
+       /*
+        * Release the subsystem state objects.
+        */
+       for_each_subsys(cgrp->root, ss)
+               ss->css_free(cgrp);
  
-               ret = ss->pre_destroy(cgrp);
-               if (ret) {
-                       /* ->pre_destroy() failure is being deprecated */
-                       WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
-                       break;
-               }
-       }
+       cgrp->root->number_of_cgroups--;
+       mutex_unlock(&cgroup_mutex);
  
-       return ret;
+       /*
+        * Drop the active superblock reference that we took when we
+        * created the cgroup
+        */
+       deactivate_super(cgrp->root->sb);
+
+       /*
+        * if we're getting rid of the cgroup, refcount should ensure
+        * that there are no pidlists left.
+        */
+       BUG_ON(!list_empty(&cgrp->pidlists));
+
+       simple_xattrs_free(&cgrp->xattrs);
+
+       ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+       kfree(cgrp);
+}
+
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+       struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+
+       schedule_work(&cgrp->free_work);
  }
  
  static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -883,7 +897,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
         /* is dentry a directory ? if so, kfree() associated cgroup */
         if (S_ISDIR(inode->i_mode)) {
                 struct cgroup *cgrp = dentry->d_fsdata;
-               struct cgroup_subsys *ss;
+
                 BUG_ON(!(cgroup_is_removed(cgrp)));
                 /* It's possible for external users to be holding css
                  * reference counts on a cgroup; css_put() needs to
@@ -891,33 +905,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
                  * the reference count in order to know if it needs to
                  * queue the cgroup to be handled by the release
                  * agent */
-               synchronize_rcu();
-
-               mutex_lock(&cgroup_mutex);
-               /*
-                * Release the subsystem state objects.
-                */
-               for_each_subsys(cgrp->root, ss)
-                       ss->destroy(cgrp);
-
-               cgrp->root->number_of_cgroups--;
-               mutex_unlock(&cgroup_mutex);
-
-               /*
-                * Drop the active superblock reference that we took when we
-                * created the cgroup
-                */
-               deactivate_super(cgrp->root->sb);
-
-               /*
-                * if we're getting rid of the cgroup, refcount should ensure
-                * that there are no pidlists left.
-                */
-               BUG_ON(!list_empty(&cgrp->pidlists));
-
-               simple_xattrs_free(&cgrp->xattrs);
-
-               kfree_rcu(cgrp, rcu_head);
+               call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
         } else {
                 struct cfent *cfe = __d_cfe(dentry);
                 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -946,13 +934,17 @@ static void remove_dir(struct dentry *d)
         dput(parent);
  }
  
-static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
  {
         struct cfent *cfe;
  
         lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
         lockdep_assert_held(&cgroup_mutex);
  
+       /*
+        * If we're doing cleanup due to failure of cgroup_create(),
+        * the corresponding @cfe may not exist.
+        */
         list_for_each_entry(cfe, &cgrp->files, node) {
                 struct dentry *d = cfe->dentry;
  
@@ -965,9 +957,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
                 list_del_init(&cfe->node);
                 dput(d);
  
-               return 0;
+               break;
         }
-       return -ENOENT;
  }
  
  /**
@@ -987,7 +978,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
                 if (!test_bit(ss->subsys_id, &subsys_mask))
                         continue;
                 list_for_each_entry(set, &ss->cftsets, node)
-                       cgroup_rm_file(cgrp, set->cfts);
+                       cgroup_addrm_files(cgrp, NULL, set->cfts, false);
         }
         if (base_files) {
                 while (!list_empty(&cgrp->files))
@@ -1014,33 +1005,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
         remove_dir(dentry);
  }
  
-/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
-       if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
-               wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
-       css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
-       cgroup_wakeup_rmdir_waiter(css->cgroup);
-       css_put(css);
-}
-
  /*
   * Call with cgroup_mutex held. Drops reference counts on modules, including
   * any duplicate ones that parse_cgroupfs_options took. If this function
@@ -1131,7 +1095,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                 }
         }
         root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
-       synchronize_rcu();
  
         return 0;
  }
@@ -1150,7 +1113,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
                 seq_puts(seq, ",xattr");
         if (strlen(root->release_agent_path))
                 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
-       if (clone_children(&root->top_cgroup))
+       if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
                 seq_puts(seq, ",clone_children");
         if (strlen(root->name))
                 seq_printf(seq, ",name=%s", root->name);
@@ -1162,7 +1125,7 @@ struct cgroup_sb_opts {
         unsigned long subsys_mask;
         unsigned long flags;
         char *release_agent;
-       bool clone_children;
+       bool cpuset_clone_children;
         char *name;
         /* User explicitly requested empty subsystem */
         bool none;
@@ -1213,7 +1176,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
                         continue;
                 }
                 if (!strcmp(token, "clone_children")) {
-                       opts->clone_children = true;
+                       opts->cpuset_clone_children = true;
                         continue;
                 }
                 if (!strcmp(token, "xattr")) {
@@ -1381,7 +1344,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
         if (ret)
                 goto out_unlock;
  
-       /* See feature-removal-schedule.txt */
         if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
                 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
                            task_tgid_nr(current), current->comm);
@@ -1397,14 +1359,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
                 goto out_unlock;
         }
  
+       /*
+        * Clear out the files of subsystems that should be removed, do
+        * this before rebind_subsystems, since rebind_subsystems may
+        * change this hierarchy's subsys_list.
+        */
+       cgroup_clear_directory(cgrp->dentry, false, removed_mask);
+
         ret = rebind_subsystems(root, opts.subsys_mask);
         if (ret) {
+               /* rebind_subsystems failed, re-populate the removed files */
+               cgroup_populate_dir(cgrp, false, removed_mask);
                 drop_parsed_module_refcounts(opts.subsys_mask);
                 goto out_unlock;
         }
  
-       /* clear out any existing files and repopulate subsystem files */
-       cgroup_clear_directory(cgrp->dentry, false, removed_mask);
         /* re-populate subsystem files */
         cgroup_populate_dir(cgrp, false, added_mask);
  
@@ -1432,8 +1401,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
         INIT_LIST_HEAD(&cgrp->children);
         INIT_LIST_HEAD(&cgrp->files);
         INIT_LIST_HEAD(&cgrp->css_sets);
+       INIT_LIST_HEAD(&cgrp->allcg_node);
         INIT_LIST_HEAD(&cgrp->release_list);
         INIT_LIST_HEAD(&cgrp->pidlists);
+       INIT_WORK(&cgrp->free_work, cgroup_free_fn);
         mutex_init(&cgrp->pidlist_mutex);
         INIT_LIST_HEAD(&cgrp->event_list);
         spin_lock_init(&cgrp->event_list_lock);
@@ -1450,8 +1421,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
         root->number_of_cgroups = 1;
         cgrp->root = root;
         cgrp->top_cgroup = cgrp;
-       list_add_tail(&cgrp->allcg_node, &root->allcg_list);
         init_cgroup_housekeeping(cgrp);
+       list_add_tail(&cgrp->allcg_node, &root->allcg_list);
  }
  
  static bool init_root_id(struct cgroupfs_root *root)
@@ -1518,12 +1489,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
  
         root->subsys_mask = opts->subsys_mask;
         root->flags = opts->flags;
+       ida_init(&root->cgroup_ida);
         if (opts->release_agent)
                 strcpy(root->release_agent_path, opts->release_agent);
         if (opts->name)
                 strcpy(root->name, opts->name);
-       if (opts->clone_children)
-               set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
+       if (opts->cpuset_clone_children)
+               set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
         return root;
  }
  
@@ -1536,6 +1508,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
         spin_lock(&hierarchy_id_lock);
         ida_remove(&hierarchy_ida, root->hierarchy_id);
         spin_unlock(&hierarchy_id_lock);
+       ida_destroy(&root->cgroup_ida);
         kfree(root);
  }
  
@@ -1636,6 +1609,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 struct cgroupfs_root *existing_root;
                 const struct cred *cred;
                 int i;
+               struct hlist_node *node;
+               struct css_set *cg;
  
                 BUG_ON(sb->s_root != NULL);
  
@@ -1689,19 +1664,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 /* Link the top cgroup in this hierarchy into all
                  * the css_set objects */
                 write_lock(&css_set_lock);
-               for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
-                       struct hlist_head *hhead = &css_set_table[i];
-                       struct hlist_node *node;
-                       struct css_set *cg;
-
-                       hlist_for_each_entry(cg, node, hhead, hlist)
-                               link_css_set(&tmp_cg_links, cg, root_cgrp);
-               }
+               hash_for_each(css_set_table, i, node, cg, hlist)
+                       link_css_set(&tmp_cg_links, cg, root_cgrp);
                 write_unlock(&css_set_lock);
  
                 free_cg_links(&tmp_cg_links);
  
-               BUG_ON(!list_empty(&root_cgrp->sibling));
                 BUG_ON(!list_empty(&root_cgrp->children));
                 BUG_ON(root->number_of_cgroups != 1);
  
@@ -1750,7 +1718,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
  
         BUG_ON(root->number_of_cgroups != 1);
         BUG_ON(!list_empty(&cgrp->children));
-       BUG_ON(!list_empty(&cgrp->sibling));
  
         mutex_lock(&cgroup_mutex);
         mutex_lock(&cgroup_root_mutex);
@@ -1808,11 +1775,13 @@ static struct kobject *cgroup_kobj;
   */
  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
  {
+       struct dentry *dentry = cgrp->dentry;
         char *start;
-       struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
-                                                     cgroup_lock_is_held());
  
-       if (!dentry || cgrp == dummytop) {
+       rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
+                          "cgroup_path() called without proper locking");
+
+       if (cgrp == dummytop) {
                 /*
                  * Inactive subsystems have no dentry for their root
                  * cgroup
@@ -1821,9 +1790,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
                 return 0;
         }
  
-       start = buf + buflen;
+       start = buf + buflen - 1;
  
-       *--start = '\0';
+       *start = '\0';
         for (;;) {
                 int len = dentry->d_name.len;
  
@@ -1834,8 +1803,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
                 if (!cgrp)
                         break;
  
-               dentry = rcu_dereference_check(cgrp->dentry,
-                                              cgroup_lock_is_held());
+               dentry = cgrp->dentry;
                 if (!cgrp->parent)
                         continue;
                 if (--start < buf)
@@ -1930,9 +1898,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
  /*
   * cgroup_task_migrate - move a task from one cgroup to another.
   *
- * 'guarantee' is set if the caller promises that a new css_set for the task
- * will already exist. If not set, this function might sleep, and can fail with
- * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
+ * Must be called with cgroup_mutex and threadgroup locked.
   */
  static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
                                 struct task_struct *tsk, struct css_set *newcg)
@@ -2024,13 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
                         ss->attach(cgrp, &tset);
         }
  
-       synchronize_rcu();
-
-       /*
-        * wake up rmdir() waiter. the rmdir should fail since the cgroup
-        * is no longer empty.
-        */
-       cgroup_wakeup_rmdir_waiter(cgrp);
  out:
         if (retval) {
                 for_each_subsys(root, ss) {
@@ -2199,8 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
         /*
          * step 5: success! and cleanup
          */
-       synchronize_rcu();
-       cgroup_wakeup_rmdir_waiter(cgrp);
         retval = 0;
  out_put_css_set_refs:
         if (retval) {
@@ -2711,10 +2668,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
  
                 /* start off with i_nlink == 2 (for "." entry) */
                 inc_nlink(inode);
+               inc_nlink(dentry->d_parent->d_inode);
  
-               /* start with the directory inode held, so that we can
-                * populate it without racing with another mkdir */
-               mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+               /*
+                * Control reaches here with cgroup_mutex held.
+                * @inode->i_mutex should nest outside cgroup_mutex but we
+                * want to populate it immediately without releasing
+                * cgroup_mutex.  As @inode isn't visible to anyone else
+                * yet, trylock will always succeed without affecting
+                * lockdep checks.
+                */
+               WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
         } else if (S_ISREG(mode)) {
                 inode->i_size = 0;
                 inode->i_fop = &cgroup_file_operations;
@@ -2725,32 +2689,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
         return 0;
  }
  
-/*
- * cgroup_create_dir - create a directory for an object.
- * @cgrp: the cgroup we create the directory for. It must have a valid
- *        ->parent field. And we are going to fill its ->dentry field.
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new directory.
- */
-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
-                               umode_t mode)
-{
-       struct dentry *parent;
-       int error = 0;
-
-       parent = cgrp->parent->dentry;
-       error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
-       if (!error) {
-               dentry->d_fsdata = cgrp;
-               inc_nlink(parent->d_inode);
-               rcu_assign_pointer(cgrp->dentry, dentry);
-               dget(dentry);
-       }
-       dput(dentry);
-
-       return error;
-}
-
  /**
   * cgroup_file_mode - deduce file mode of a control file
   * @cft: the control file in question
@@ -2791,12 +2729,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
  
         simple_xattrs_init(&cft->xattrs);
  
-       /* does @cft->flags tell us to skip creation on @cgrp? */
-       if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
-               return 0;
-       if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
-               return 0;
-
         if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
                 strcpy(name, subsys->name);
                 strcat(name, ".");
@@ -2837,14 +2769,20 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
         int err, ret = 0;
  
         for (cft = cfts; cft->name[0] != '\0'; cft++) {
-               if (is_add)
+               /* does cft->flags tell us to skip this file on @cgrp? */
+               if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+                       continue;
+               if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+                       continue;
+
+               if (is_add) {
                         err = cgroup_add_file(cgrp, subsys, cft);
-               else
-                       err = cgroup_rm_file(cgrp, cft);
-               if (err) {
-                       pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
-                                  is_add ? "add" : "remove", cft->name, err);
+                       if (err)
+                               pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
+                                       cft->name, err);
                         ret = err;
+               } else {
+                       cgroup_rm_file(cgrp, cft);
                 }
         }
         return ret;
@@ -3044,6 +2982,118 @@ static void cgroup_enable_task_cg_lists(void)
         write_unlock(&css_set_lock);
  }
  
+/**
+ * cgroup_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_pre().  Find the next
+ * descendant to visit for pre-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
+                                         struct cgroup *cgroup)
+{
+       struct cgroup *next;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       /* if first iteration, pretend we just visited @cgroup */
+       if (!pos) {
+               if (list_empty(&cgroup->children))
+                       return NULL;
+               pos = cgroup;
+       }
+
+       /* visit the first child if exists */
+       next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
+       if (next)
+               return next;
+
+       /* no child, visit my or the closest ancestor's next sibling */
+       do {
+               next = list_entry_rcu(pos->sibling.next, struct cgroup,
+                                     sibling);
+               if (&next->sibling != &pos->parent->children)
+                       return next;
+
+               pos = pos->parent;
+       } while (pos != cgroup);
+
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos.  If there's no descendant,
+ * @pos is returned.  This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+       struct cgroup *last, *tmp;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       do {
+               last = pos;
+               /* ->prev isn't RCU safe, walk ->next till the end */
+               pos = NULL;
+               list_for_each_entry_rcu(tmp, &last->children, sibling)
+                       pos = tmp;
+       } while (pos);
+
+       return last;
+}
+EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
+
+static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
+{
+       struct cgroup *last;
+
+       do {
+               last = pos;
+               pos = list_first_or_null_rcu(&pos->children, struct cgroup,
+                                            sibling);
+       } while (pos);
+
+       return last;
+}
+
+/**
+ * cgroup_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_post().  Find the next
+ * descendant to visit for post-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
+                                          struct cgroup *cgroup)
+{
+       struct cgroup *next;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       /* if first iteration, visit the leftmost descendant */
+       if (!pos) {
+               next = cgroup_leftmost_descendant(cgroup);
+               return next != cgroup ? next : NULL;
+       }
+
+       /* if there's an unvisited sibling, visit its leftmost descendant */
+       next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+       if (&next->sibling != &pos->parent->children)
+               return cgroup_leftmost_descendant(next);
+
+       /* no sibling left, visit parent */
+       next = pos->parent;
+       return next != cgroup ? next : NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
+
  void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
         __acquires(css_set_lock)
  {
@@ -3390,7 +3440,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
  {
         struct cgroup_pidlist *l;
         /* don't need task_nsproxy() if we're looking at ourself */
-       struct pid_namespace *ns = current->nsproxy->pid_ns;
+       struct pid_namespace *ns = task_active_pid_ns(current);
  
         /*
          * We can't drop the pidlist_mutex before taking the l->mutex in case
@@ -3757,7 +3807,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
         if (flags & POLLHUP) {
                 __remove_wait_queue(event->wqh, &event->wait);
                 spin_lock(&cgrp->event_list_lock);
-               list_del(&event->list);
+               list_del_init(&event->list);
                 spin_unlock(&cgrp->event_list_lock);
                 /*
                  * We are in atomic context, but cgroup_event_remove() may
@@ -3894,7 +3944,7 @@ fail:
  static u64 cgroup_clone_children_read(struct cgroup *cgrp,
                                     struct cftype *cft)
  {
-       return clone_children(cgrp);
+       return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
  }
  
  static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3902,9 +3952,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
                                      u64 val)
  {
         if (val)
-               set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+               set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
         else
-               clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+               clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
         return 0;
  }
  
@@ -4017,19 +4067,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
         css->flags = 0;
         css->id = NULL;
         if (cgrp == dummytop)
-               set_bit(CSS_ROOT, &css->flags);
+               css->flags |= CSS_ROOT;
         BUG_ON(cgrp->subsys[ss->subsys_id]);
         cgrp->subsys[ss->subsys_id] = css;
  
         /*
-        * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
-        * which is put on the last css_put().  dput() requires process
-        * context, which css_put() may be called without.  @css->dput_work
-        * will be used to invoke dput() asynchronously from css_put().
+        * css holds an extra ref to @cgrp->dentry which is put on the last
+        * css_put().  dput() requires process context, which css_put() may
+        * be called without.  @css->dput_work will be used to invoke
+        * dput() asynchronously from css_put().
          */
         INIT_WORK(&css->dput_work, css_dput_fn);
-       if (ss->__DEPRECATED_clear_css_refs)
-               set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
+}
+
+/* invoke ->css_online() on a new CSS and mark it online if successful */
+static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+       int ret = 0;
+
+       lockdep_assert_held(&cgroup_mutex);
+
+       if (ss->css_online)
+               ret = ss->css_online(cgrp);
+       if (!ret)
+               cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
+       return ret;
+}
+
+/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
+static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+       __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+       struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+       lockdep_assert_held(&cgroup_mutex);
+
+       if (!(css->flags & CSS_ONLINE))
+               return;
+
+       /*
+        * css_offline() should be called with cgroup_mutex unlocked.  See
+        * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
+        * details.  This temporary unlocking should go away once
+        * cgroup_mutex is unexported from controllers.
+        */
+       if (ss->css_offline) {
+               mutex_unlock(&cgroup_mutex);
+               ss->css_offline(cgrp);
+               mutex_lock(&cgroup_mutex);
+       }
+
+       cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
  }
  
  /*
@@ -4049,10 +4137,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         struct cgroup_subsys *ss;
         struct super_block *sb = root->sb;
  
+       /* allocate the cgroup and its ID, 0 is reserved for the root */
         cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
         if (!cgrp)
                 return -ENOMEM;
  
+       cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
+       if (cgrp->id < 0)
+               goto err_free_cgrp;
+
+       /*
+        * Only live parents can have children.  Note that the liveliness
+        * check isn't strictly necessary because cgroup_mkdir() and
+        * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
+        * anyway so that locking is contained inside cgroup proper and we
+        * don't get nasty surprises if we ever grow another caller.
+        */
+       if (!cgroup_lock_live_group(parent)) {
+               err = -ENODEV;
+               goto err_free_id;
+       }
+
         /* Grab a reference on the superblock so the hierarchy doesn't
          * get deleted on unmount if there are child cgroups.  This
          * can be done outside cgroup_mutex, since the sb can't
@@ -4060,10 +4165,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
          * fs */
         atomic_inc(&sb->s_active);
  
-       mutex_lock(&cgroup_mutex);
-
         init_cgroup_housekeeping(cgrp);
  
+       dentry->d_fsdata = cgrp;
+       cgrp->dentry = dentry;
+
         cgrp->parent = parent;
         cgrp->root = parent->root;
         cgrp->top_cgroup = parent->top_cgroup;
@@ -4071,26 +4177,49 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         if (notify_on_release(parent))
                 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  
-       if (clone_children(parent))
-               set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+       if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+               set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
  
         for_each_subsys(root, ss) {
                 struct cgroup_subsys_state *css;
  
-               css = ss->create(cgrp);
+               css = ss->css_alloc(cgrp);
                 if (IS_ERR(css)) {
                         err = PTR_ERR(css);
-                       goto err_destroy;
+                       goto err_free_all;
                 }
                 init_cgroup_css(css, ss, cgrp);
                 if (ss->use_id) {
                         err = alloc_css_id(ss, parent, cgrp);
                         if (err)
-                               goto err_destroy;
+                               goto err_free_all;
                 }
-               /* At error, ->destroy() callback has to free assigned ID. */
-               if (clone_children(parent) && ss->post_clone)
-                       ss->post_clone(cgrp);
+       }
+
+       /*
+        * Create directory.  cgroup_create_file() returns with the new
+        * directory locked on success so that it can be populated without
+        * dropping cgroup_mutex.
+        */
+       err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
+       if (err < 0)
+               goto err_free_all;
+       lockdep_assert_held(&dentry->d_inode->i_mutex);
+
+       /* allocation complete, commit to creation */
+       list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+       list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
+       root->number_of_cgroups++;
+
+       /* each css holds a ref to the cgroup's dentry */
+       for_each_subsys(root, ss)
+               dget(dentry);
+
+       /* creation succeeded, notify subsystems */
+       for_each_subsys(root, ss) {
+               err = online_css(ss, cgrp);
+               if (err)
+                       goto err_destroy;
  
                 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
                     parent->parent) {
@@ -4102,50 +4231,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                 }
         }
  
-       list_add(&cgrp->sibling, &cgrp->parent->children);
-       root->number_of_cgroups++;
-
-       err = cgroup_create_dir(cgrp, dentry, mode);
-       if (err < 0)
-               goto err_remove;
-
-       /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
-       for_each_subsys(root, ss)
-               if (!ss->__DEPRECATED_clear_css_refs)
-                       dget(dentry);
-
-       /* The cgroup directory was pre-locked for us */
-       BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
-
-       list_add_tail(&cgrp->allcg_node, &root->allcg_list);
-
         err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
-       /* If err < 0, we have a half-filled directory - oh well ;) */
+       if (err)
+               goto err_destroy;
  
         mutex_unlock(&cgroup_mutex);
         mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
  
         return 0;
  
- err_remove:
-
-       list_del(&cgrp->sibling);
-       root->number_of_cgroups--;
-
- err_destroy:
-
+err_free_all:
         for_each_subsys(root, ss) {
                 if (cgrp->subsys[ss->subsys_id])
-                       ss->destroy(cgrp);
+                       ss->css_free(cgrp);
         }
-
         mutex_unlock(&cgroup_mutex);
-
         /* Release the reference count that we took on the superblock */
         deactivate_super(sb);
-
+err_free_id:
+       ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_cgrp:
         kfree(cgrp);
         return err;
+
+err_destroy:
+       cgroup_destroy_locked(cgrp);
+       mutex_unlock(&cgroup_mutex);
+       mutex_unlock(&dentry->d_inode->i_mutex);
+       return err;
  }
  
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -4197,153 +4310,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
         return 0;
  }
  
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- *
- * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
- * not, cgroup removal behaves differently.
- *
- * If clear is set, css refcnt for the subsystem should be zero before
- * cgroup removal can be committed.  This is implemented by
- * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
- * called multiple times until all css refcnts reach zero and is allowed to
- * veto removal on any invocation.  This behavior is deprecated and will be
- * removed as soon as the existing user (memcg) is updated.
- *
- * If clear is not set, each css holds an extra reference to the cgroup's
- * dentry and cgroup removal proceeds regardless of css refs.
- * ->pre_destroy() will be called at least once and is not allowed to fail.
- * On the last put of each css, whenever that may be, the extra dentry ref
- * is put so that dentry destruction happens only after all css's are
- * released.
- */
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+       __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
+       struct dentry *d = cgrp->dentry;
+       struct cgroup *parent = cgrp->parent;
+       DEFINE_WAIT(wait);
+       struct cgroup_event *event, *tmp;
         struct cgroup_subsys *ss;
-       unsigned long flags;
-       bool failed = false;
+       LIST_HEAD(tmp_list);
  
-       local_irq_save(flags);
+       lockdep_assert_held(&d->d_inode->i_mutex);
+       lockdep_assert_held(&cgroup_mutex);
+
+       if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
+               return -EBUSY;
  
         /*
-        * Block new css_tryget() by deactivating refcnt.  If all refcnts
-        * for subsystems w/ clear_css_refs set were 1 at the moment of
-        * deactivation, we succeeded.
+        * Block new css_tryget() by deactivating refcnt and mark @cgrp
+        * removed.  This makes future css_tryget() and child creation
+        * attempts fail thus maintaining the removal conditions verified
+        * above.
          */
         for_each_subsys(cgrp->root, ss) {
                 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  
                 WARN_ON(atomic_read(&css->refcnt) < 0);
                 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-
-               if (ss->__DEPRECATED_clear_css_refs)
-                       failed |= css_refcnt(css) != 1;
-       }
-
-       /*
-        * If succeeded, set REMOVED and put all the base refs; otherwise,
-        * restore refcnts to positive values.  Either way, all in-progress
-        * css_tryget() will be released.
-        */
-       for_each_subsys(cgrp->root, ss) {
-               struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
-               if (!failed) {
-                       set_bit(CSS_REMOVED, &css->flags);
-                       css_put(css);
-               } else {
-                       atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
-               }
-       }
-
-       local_irq_restore(flags);
-       return !failed;
-}
-
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
-{
-       struct cgroup *cgrp = dentry->d_fsdata;
-       struct dentry *d;
-       struct cgroup *parent;
-       DEFINE_WAIT(wait);
-       struct cgroup_event *event, *tmp;
-       int ret;
-
-       /* the vfs holds both inode->i_mutex already */
-again:
-       mutex_lock(&cgroup_mutex);
-       if (atomic_read(&cgrp->count) != 0) {
-               mutex_unlock(&cgroup_mutex);
-               return -EBUSY;
         }
-       if (!list_empty(&cgrp->children)) {
-               mutex_unlock(&cgroup_mutex);
-               return -EBUSY;
-       }
-       mutex_unlock(&cgroup_mutex);
+       set_bit(CGRP_REMOVED, &cgrp->flags);
  
-       /*
-        * In general, subsystem has no css->refcnt after pre_destroy(). But
-        * in racy cases, subsystem may have to get css->refcnt after
-        * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
-        * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
-        * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
-        * and subsystem's reference count handling. Please see css_get/put
-        * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
-        */
-       set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+       /* tell subsystems to initiate destruction */
+       for_each_subsys(cgrp->root, ss)
+               offline_css(ss, cgrp);
  
         /*
-        * Call pre_destroy handlers of subsys. Notify subsystems
-        * that rmdir() request comes.
+        * Put all the base refs.  Each css holds an extra reference to the
+        * cgroup's dentry and cgroup removal proceeds regardless of css
+        * refs.  On the last put of each css, whenever that may be, the
+        * extra dentry ref is put so that dentry destruction happens only
+        * after all css's are released.
          */
-       ret = cgroup_call_pre_destroy(cgrp);
-       if (ret) {
-               clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-               return ret;
-       }
-
-       mutex_lock(&cgroup_mutex);
-       parent = cgrp->parent;
-       if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
-               clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-               mutex_unlock(&cgroup_mutex);
-               return -EBUSY;
-       }
-       prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
-       if (!cgroup_clear_css_refs(cgrp)) {
-               mutex_unlock(&cgroup_mutex);
-               /*
-                * Because someone may call cgroup_wakeup_rmdir_waiter() before
-                * prepare_to_wait(), we need to check this flag.
-                */
-               if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
-                       schedule();
-               finish_wait(&cgroup_rmdir_waitq, &wait);
-               clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-               if (signal_pending(current))
-                       return -EINTR;
-               goto again;
-       }
-       /* NO css_tryget() can success after here. */
-       finish_wait(&cgroup_rmdir_waitq, &wait);
-       clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+       for_each_subsys(cgrp->root, ss)
+               css_put(cgrp->subsys[ss->subsys_id]);
  
         raw_spin_lock(&release_list_lock);
-       set_bit(CGRP_REMOVED, &cgrp->flags);
         if (!list_empty(&cgrp->release_list))
                 list_del_init(&cgrp->release_list);
         raw_spin_unlock(&release_list_lock);
  
         /* delete this cgroup from parent->children */
-       list_del_init(&cgrp->sibling);
-
+       list_del_rcu(&cgrp->sibling);
         list_del_init(&cgrp->allcg_node);
  
-       d = dget(cgrp->dentry);
-
+       dget(d);
         cgroup_d_remove_dir(d);
         dput(d);
  
@@ -4353,21 +4373,35 @@ again:
         /*
          * Unregister events and notify userspace.
          * Notify userspace about cgroup removing only after rmdir of cgroup
-        * directory to avoid race between userspace and kernelspace
+        * directory to avoid race between userspace and kernelspace. Use
+        * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
+        * cgroup_event_wake() is called with the wait queue head locked,
+        * remove_wait_queue() cannot be called while holding event_list_lock.
          */
         spin_lock(&cgrp->event_list_lock);
-       list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-               list_del(&event->list);
+       list_splice_init(&cgrp->event_list, &tmp_list);
+       spin_unlock(&cgrp->event_list_lock);
+       list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+               list_del_init(&event->list);
                 remove_wait_queue(event->wqh, &event->wait);
                 eventfd_signal(event->eventfd, 1);
                 schedule_work(&event->remove);
         }
-       spin_unlock(&cgrp->event_list_lock);
  
-       mutex_unlock(&cgroup_mutex);
         return 0;
  }
  
+static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+{
+       int ret;
+
+       mutex_lock(&cgroup_mutex);
+       ret = cgroup_destroy_locked(dentry->d_fsdata);
+       mutex_unlock(&cgroup_mutex);
+
+       return ret;
+}
+
  static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
  {
         INIT_LIST_HEAD(&ss->cftsets);
@@ -4388,13 +4422,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
  
         printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
  
+       mutex_lock(&cgroup_mutex);
+
         /* init base cftset */
         cgroup_init_cftsets(ss);
  
         /* Create the top cgroup state for this subsystem */
         list_add(&ss->sibling, &rootnode.subsys_list);
         ss->root = &rootnode;
-       css = ss->create(dummytop);
+       css = ss->css_alloc(dummytop);
         /* We don't handle early failures gracefully */
         BUG_ON(IS_ERR(css));
         init_cgroup_css(css, ss, dummytop);
@@ -4403,7 +4439,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
          * pointer to this state - since the subsystem is
          * newly registered, all tasks and hence the
          * init_css_set is in the subsystem's top cgroup. */
-       init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+       init_css_set.subsys[ss->subsys_id] = css;
  
         need_forkexit_callback |= ss->fork || ss->exit;
  
@@ -4413,6 +4449,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
         BUG_ON(!list_empty(&init_task.tasks));
  
         ss->active = 1;
+       BUG_ON(online_css(ss, dummytop));
+
+       mutex_unlock(&cgroup_mutex);
  
         /* this function shouldn't be used with modular subsystems, since they
          * need to register a subsys_id, among other things */
@@ -4430,12 +4469,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
   */
  int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
  {
-       int i;
         struct cgroup_subsys_state *css;
+       int i, ret;
+       struct hlist_node *node, *tmp;
+       struct css_set *cg;
+       unsigned long key;
  
         /* check name and function validity */
         if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
-           ss->create == NULL || ss->destroy == NULL)
+           ss->css_alloc == NULL || ss->css_free == NULL)
                 return -EINVAL;
  
         /*
@@ -4464,10 +4506,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
         subsys[ss->subsys_id] = ss;
  
         /*
-        * no ss->create seems to need anything important in the ss struct, so
-        * this can happen first (i.e. before the rootnode attachment).
+        * no ss->css_alloc seems to need anything important in the ss
+        * struct, so this can happen first (i.e. before the rootnode
+        * attachment).
          */
-       css = ss->create(dummytop);
+       css = ss->css_alloc(dummytop);
         if (IS_ERR(css)) {
                 /* failure case - need to deassign the subsys[] slot. */
                 subsys[ss->subsys_id] = NULL;
@@ -4482,14 +4525,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
         init_cgroup_css(css, ss, dummytop);
         /* init_idr must be after init_cgroup_css because it sets css->id. */
         if (ss->use_id) {
-               int ret = cgroup_init_idr(ss, css);
-               if (ret) {
-                       dummytop->subsys[ss->subsys_id] = NULL;
-                       ss->destroy(dummytop);
-                       subsys[ss->subsys_id] = NULL;
-                       mutex_unlock(&cgroup_mutex);
-                       return ret;
-               }
+               ret = cgroup_init_idr(ss, css);
+               if (ret)
+                       goto err_unload;
         }
  
         /*
@@ -4501,31 +4539,34 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
          * this is all done under the css_set_lock.
          */
         write_lock(&css_set_lock);
-       for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
-               struct css_set *cg;
-               struct hlist_node *node, *tmp;
-               struct hlist_head *bucket = &css_set_table[i], *new_bucket;
-
-               hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
-                       /* skip entries that we already rehashed */
-                       if (cg->subsys[ss->subsys_id])
-                               continue;
-                       /* remove existing entry */
-                       hlist_del(&cg->hlist);
-                       /* set new value */
-                       cg->subsys[ss->subsys_id] = css;
-                       /* recompute hash and restore entry */
-                       new_bucket = css_set_hash(cg->subsys);
-                       hlist_add_head(&cg->hlist, new_bucket);
-               }
+       hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) {
+               /* skip entries that we already rehashed */
+               if (cg->subsys[ss->subsys_id])
+                       continue;
+               /* remove existing entry */
+               hash_del(&cg->hlist);
+               /* set new value */
+               cg->subsys[ss->subsys_id] = css;
+               /* recompute hash and restore entry */
+               key = css_set_hash(cg->subsys);
+               hash_add(css_set_table, node, key);
         }
         write_unlock(&css_set_lock);
  
         ss->active = 1;
+       ret = online_css(ss, dummytop);
+       if (ret)
+               goto err_unload;
  
         /* success! */
         mutex_unlock(&cgroup_mutex);
         return 0;
+
+err_unload:
+       mutex_unlock(&cgroup_mutex);
+       /* @ss can't be mounted here as try_module_get() would fail */
+       cgroup_unload_subsys(ss);
+       return ret;
  }
  EXPORT_SYMBOL_GPL(cgroup_load_subsys);
  
@@ -4540,7 +4581,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
  void cgroup_unload_subsys(struct cgroup_subsys *ss)
  {
         struct cg_cgroup_link *link;
-       struct hlist_head *hhead;
  
         BUG_ON(ss->module == NULL);
  
@@ -4552,6 +4592,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
         BUG_ON(ss->root != &rootnode);
  
         mutex_lock(&cgroup_mutex);
+
+       offline_css(ss, dummytop);
+       ss->active = 0;
+
+       if (ss->use_id) {
+               idr_remove_all(&ss->idr);
+               idr_destroy(&ss->idr);
+       }
+
         /* deassign the subsys_id */
         subsys[ss->subsys_id] = NULL;
  
@@ -4565,22 +4614,22 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
         write_lock(&css_set_lock);
         list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
                 struct css_set *cg = link->cg;
+               unsigned long key;
  
-               hlist_del(&cg->hlist);
-               BUG_ON(!cg->subsys[ss->subsys_id]);
+               hash_del(&cg->hlist);
                 cg->subsys[ss->subsys_id] = NULL;
-               hhead = css_set_hash(cg->subsys);
-               hlist_add_head(&cg->hlist, hhead);
+               key = css_set_hash(cg->subsys);
+               hash_add(css_set_table, &cg->hlist, key);
         }
         write_unlock(&css_set_lock);
  
         /*
-        * remove subsystem's css from the dummytop and free it - need to free
-        * before marking as null because ss->destroy needs the cgrp->subsys
-        * pointer to find their state. note that this also takes care of
-        * freeing the css_id.
+        * remove subsystem's css from the dummytop and free it - need to
+        * free before marking as null because ss->css_free needs the
+        * cgrp->subsys pointer to find their state. note that this also
+        * takes care of freeing the css_id.
          */
-       ss->destroy(dummytop);
+       ss->css_free(dummytop);
         dummytop->subsys[ss->subsys_id] = NULL;
  
         mutex_unlock(&cgroup_mutex);
@@ -4612,9 +4661,6 @@ int __init cgroup_init_early(void)
         list_add(&init_css_set_link.cg_link_list,
                  &init_css_set.cg_links);
  
-       for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
-               INIT_HLIST_HEAD(&css_set_table[i]);
-
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                 struct cgroup_subsys *ss = subsys[i];
  
@@ -4624,8 +4670,8 @@ int __init cgroup_init_early(void)
  
                 BUG_ON(!ss->name);
                 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
-               BUG_ON(!ss->create);
-               BUG_ON(!ss->destroy);
+               BUG_ON(!ss->css_alloc);
+               BUG_ON(!ss->css_free);
                 if (ss->subsys_id != i) {
                         printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
                                ss->name, ss->subsys_id);
@@ -4648,7 +4694,7 @@ int __init cgroup_init(void)
  {
         int err;
         int i;
-       struct hlist_head *hhead;
+       unsigned long key;
  
         err = bdi_init(&cgroup_backing_dev_info);
         if (err)
@@ -4667,8 +4713,8 @@ int __init cgroup_init(void)
         }
  
         /* Add init_css_set to the hash table */
-       hhead = css_set_hash(init_css_set.subsys);
-       hlist_add_head(&init_css_set.hlist, hhead);
+       key = css_set_hash(init_css_set.subsys);
+       hash_add(css_set_table, &init_css_set.hlist, key);
         BUG_ON(!init_root_id(&rootnode));
  
         cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4831,45 +4877,20 @@ void cgroup_fork(struct task_struct *child)
         INIT_LIST_HEAD(&child->cg_list);
  }
  
-/**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
-       if (need_forkexit_callback) {
-               int i;
-               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                       struct cgroup_subsys *ss = subsys[i];
-
-                       /*
-                        * forkexit callbacks are only supported for
-                        * builtin subsystems.
-                        */
-                       if (!ss || ss->module)
-                               continue;
-
-                       if (ss->fork)
-                               ss->fork(child);
-               }
-       }
-}
-
  /**
   * cgroup_post_fork - called on a new task after adding it to the task list
   * @child: the task in question
   *
- * Adds the task to the list running through its css_set if necessary.
- * Has to be after the task is visible on the task list in case we race
- * with the first call to cgroup_iter_start() - to guarantee that the
- * new task ends up on its list.
+ * Adds the task to the list running through its css_set if necessary and
+ * calls the subsystem fork() callbacks.  Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_iter_start() - to guarantee that the new task ends up on its
+ * list.
   */
  void cgroup_post_fork(struct task_struct *child)
  {
+       int i;
+
         /*
          * use_task_css_set_links is set to 1 before we walk the tasklist
          * under the tasklist_lock and we read it here after we added the child
@@ -4889,7 +4910,30 @@ void cgroup_post_fork(struct task_struct *child)
                 task_unlock(child);
                 write_unlock(&css_set_lock);
         }
+
+       /*
+        * Call ss->fork().  This must happen after @child is linked on
+        * css_set; otherwise, @child might change state between ->fork()
+        * and addition to css_set.
+        */
+       if (need_forkexit_callback) {
+               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                       struct cgroup_subsys *ss = subsys[i];
+
+                       /*
+                        * fork/exit callbacks are supported only for
+                        * builtin subsystems and we don't need further
+                        * synchronization as they never go away.
+                        */
+                       if (!ss || ss->module)
+                               continue;
+
+                       if (ss->fork)
+                               ss->fork(child);
+               }
+       }
  }
+
  /**
   * cgroup_exit - detach cgroup from exiting task
   * @tsk: pointer to task_struct of exiting process
@@ -4965,8 +5009,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
         }
         task_unlock(tsk);
  
-       if (cg)
-               put_css_set_taskexit(cg);
+       put_css_set_taskexit(cg);
  }
  
  /**
@@ -5022,15 +5065,17 @@ static void check_for_release(struct cgroup *cgrp)
  /* Caller must verify that the css is not for root cgroup */
  bool __css_tryget(struct cgroup_subsys_state *css)
  {
-       do {
-               int v = css_refcnt(css);
+       while (true) {
+               int t, v;
  
-               if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+               v = css_refcnt(css);
+               t = atomic_cmpxchg(&css->refcnt, v, v + 1);
+               if (likely(t == v))
                         return true;
+               else if (t < 0)
+                       return false;
                 cpu_relax();
-       } while (!test_bit(CSS_REMOVED, &css->flags));
-
-       return false;
+       }
  }
  EXPORT_SYMBOL_GPL(__css_tryget);
  
@@ -5049,11 +5094,9 @@ void __css_put(struct cgroup_subsys_state *css)
                         set_bit(CGRP_RELEASABLE, &cgrp->flags);
                         check_for_release(cgrp);
                 }
-               cgroup_wakeup_rmdir_waiter(cgrp);
                 break;
         case 0:
-               if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
-                       schedule_work(&css->dput_work);
+               schedule_work(&css->dput_work);
                 break;
         }
         rcu_read_unlock();
@@ -5439,7 +5482,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
  }
  
  #ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
+static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
  {
         struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
  
@@ -5449,7 +5492,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
         return css;
  }
  
-static void debug_destroy(struct cgroup *cont)
+static void debug_css_free(struct cgroup *cont)
  {
         kfree(cont->subsys[debug_subsys_id]);
  }
@@ -5578,8 +5621,8 @@ static struct cftype debug_files[] =  {
  
  struct cgroup_subsys debug_subsys = {
         .name = "debug",
-       .create = debug_create,
-       .destroy = debug_destroy,
+       .css_alloc = debug_css_alloc,
+       .css_free = debug_css_free,
         .subsys_id = debug_subsys_id,
         .base_cftypes = debug_files,
  };