]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blobdiff - kernel/cgroup.c
UBUNTU: Ubuntu-snapdragon-4.4.0-1028.31
[mirror_ubuntu-zesty-kernel.git] / kernel / cgroup.c
index 470f6536b9e8cfb029eedb5ecdb3c3e8c3c341be..df11deded5e2218f7ec43db8a91fec55acd4d6a1 100644 (file)
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/kthread.h>
 #include <linux/delay.h>
-
+#include <linux/cpuset.h>
 #include <linux/atomic.h>
+#include <linux/cpuset.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
+#include <net/sock.h>
 
 /*
  * pidlists linger the following amount before being destroyed.  The goal
@@ -208,6 +212,15 @@ static unsigned long have_fork_callback __read_mostly;
 static unsigned long have_exit_callback __read_mostly;
 static unsigned long have_free_callback __read_mostly;
 
+/* Cgroup namespace for init task */
+struct cgroup_namespace init_cgroup_ns = {
+       .count          = { .counter = 2, },
+       .user_ns        = &init_user_ns,
+       .ns.ops         = &cgroupns_operations,
+       .ns.inum        = PROC_CGROUP_INIT_INO,
+       .root_cset      = &init_css_set,
+};
+
 /* Ditto for the can_fork callback. */
 static unsigned long have_canfork_callback __read_mostly;
 
@@ -236,6 +249,9 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
  */
 static bool cgroup_ssid_enabled(int ssid)
 {
+       if (CGROUP_SUBSYS_COUNT == 0)
+               return false;
+
        return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 }
 
@@ -1159,6 +1175,41 @@ static void cgroup_destroy_root(struct cgroup_root *root)
        cgroup_free_root(root);
 }
 
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
+{
+       struct cgroup *res = NULL;
+       struct css_set *cset;
+
+       lockdep_assert_held(&css_set_lock);
+
+       rcu_read_lock();
+
+       cset = current->nsproxy->cgroup_ns->root_cset;
+       if (cset == &init_css_set) {
+               res = &root->cgrp;
+       } else {
+               struct cgrp_cset_link *link;
+
+               list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+                       struct cgroup *c = link->cgrp;
+
+                       if (c->root == root) {
+                               res = c;
+                               break;
+                       }
+               }
+       }
+       rcu_read_unlock();
+
+       BUG_ON(!res);
+       return res;
+}
+
 /* look up cgroup associated with given css_set on the specified hierarchy */
 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
                                            struct cgroup_root *root)
@@ -1576,6 +1627,33 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
        return 0;
 }
 
+static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+                           struct kernfs_root *kf_root)
+{
+       int len = 0;
+       char *buf = NULL;
+       struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
+       struct cgroup *ns_cgroup;
+
+       buf = kmalloc(PATH_MAX, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       spin_lock_bh(&css_set_lock);
+       ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
+       len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
+       spin_unlock_bh(&css_set_lock);
+
+       if (len >= PATH_MAX)
+               len = -ERANGE;
+       else if (len > 0) {
+               seq_escape(sf, buf, " \t\n\\");
+               len = 0;
+       }
+       kfree(buf);
+       return len;
+}
+
 static int cgroup_show_options(struct seq_file *seq,
                               struct kernfs_root *kf_root)
 {
@@ -2005,6 +2083,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                         void *data)
 {
        struct super_block *pinned_sb = NULL;
+       struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
        struct cgroup_subsys *ss;
        struct cgroup_root *root;
        struct cgroup_sb_opts opts;
@@ -2013,6 +2092,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
        int i;
        bool new_sb;
 
+       get_cgroup_ns(ns);
+
+       /* Check if the caller has permission to mount. */
+       if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+               put_cgroup_ns(ns);
+               return ERR_PTR(-EPERM);
+       }
+
        /*
         * The first time anyone tries to mount a cgroup, enable the list
         * linking each css_set to its tasks and fix up all existing tasks.
@@ -2126,6 +2213,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                goto out_unlock;
        }
 
+       /*
+        * We know this subsystem has not yet been bound.  Users in a non-init
+        * user namespace may only mount hierarchies with no bound subsystems,
+        * i.e. 'none,name=user1'
+        */
+       if (!opts.none && !capable(CAP_SYS_ADMIN)) {
+               ret = -EPERM;
+               goto out_unlock;
+       }
+
        root = kzalloc(sizeof(*root), GFP_KERNEL);
        if (!root) {
                ret = -ENOMEM;
@@ -2144,11 +2241,36 @@ out_free:
        kfree(opts.release_agent);
        kfree(opts.name);
 
-       if (ret)
+       if (ret) {
+               put_cgroup_ns(ns);
                return ERR_PTR(ret);
+       }
 
        dentry = kernfs_mount(fs_type, flags, root->kf_root,
-                               CGROUP_SUPER_MAGIC, &new_sb);
+                             CGROUP_SUPER_MAGIC, &new_sb);
+
+       /*
+        * In non-init cgroup namespace, instead of root cgroup's
+        * dentry, we return the dentry corresponding to the
+        * cgroupns->root_cgrp.
+        */
+       if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
+               struct dentry *nsdentry;
+               struct cgroup *cgrp;
+
+               mutex_lock(&cgroup_mutex);
+               spin_lock_bh(&css_set_lock);
+
+               cgrp = cset_cgroup_from_root(ns->root_cset, root);
+
+               spin_unlock_bh(&css_set_lock);
+               mutex_unlock(&cgroup_mutex);
+
+               nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
+               dput(dentry);
+               dentry = nsdentry;
+       }
+
        if (IS_ERR(dentry) || !new_sb)
                cgroup_put(&root->cgrp);
 
@@ -2161,6 +2283,7 @@ out_free:
                deactivate_super(pinned_sb);
        }
 
+       put_cgroup_ns(ns);
        return dentry;
 }
 
@@ -2189,8 +2312,39 @@ static struct file_system_type cgroup_fs_type = {
        .name = "cgroup",
        .mount = cgroup_mount,
        .kill_sb = cgroup_kill_sb,
+       .fs_flags = FS_USERNS_MOUNT,
 };
 
+static char *
+cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+                     struct cgroup_namespace *ns)
+{
+       int ret;
+       struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
+
+       ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
+       if (ret < 0 || ret >= buflen)
+               return NULL;
+       return buf;
+}
+
+char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+                    struct cgroup_namespace *ns)
+{
+       char *ret;
+
+       mutex_lock(&cgroup_mutex);
+       spin_lock_bh(&css_set_lock);
+
+       ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
+
+       spin_unlock_bh(&css_set_lock);
+       mutex_unlock(&cgroup_mutex);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(cgroup_path_ns);
+
 /**
  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
  * @task: target task
@@ -2218,7 +2372,8 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 
        if (root) {
                cgrp = task_cgroup_from_root(task, root);
-               path = cgroup_path(cgrp, buf, buflen);
+               path = cgroup_path_ns_locked(cgrp, buf, buflen,
+                                            &init_cgroup_ns);
        } else {
                /* if no hierarchy exists, everyone is in "/" */
                if (strlcpy(buf, "/", buflen) < buflen)
@@ -2498,6 +2653,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
        lockdep_assert_held(&cgroup_mutex);
        lockdep_assert_held(&css_set_lock);
 
+       /*
+        * If ->dead, @src_set is associated with one or more dead cgroups
+        * and doesn't contain any migratable tasks.  Ignore it early so
+        * that the rest of migration path doesn't get confused by it.
+        */
+       if (src_cset->dead)
+               return;
+
        src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
 
        if (!list_empty(&src_cset->mg_preload_node))
@@ -2713,9 +2876,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
                                    size_t nbytes, loff_t off, bool threadgroup)
 {
        struct task_struct *tsk;
+       struct cgroup_subsys *ss;
        struct cgroup *cgrp;
        pid_t pid;
-       int ret;
+       int ssid, ret;
 
        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
                return -EINVAL;
@@ -2740,11 +2904,12 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
                tsk = tsk->group_leader;
 
        /*
-        * Workqueue threads may acquire PF_NO_SETAFFINITY and become
-        * trapped in a cpuset, or RT worker may be born in a cgroup
-        * with no rt_runtime allocated.  Just say no.
+        * kthreads may acquire PF_NO_SETAFFINITY during initialization.
+        * If userland migrates such a kthread to a non-root cgroup, it can
+        * become trapped in a cpuset, or RT kthread may be born in a
+        * cgroup with no rt_runtime allocated.  Just say no.
         */
-       if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+       if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
                ret = -EINVAL;
                goto out_unlock_rcu;
        }
@@ -2763,6 +2928,9 @@ out_unlock_rcu:
        rcu_read_unlock();
 out_unlock_threadgroup:
        percpu_up_write(&cgroup_threadgroup_rwsem);
+       for_each_subsys(ss, ssid)
+               if (ss->post_attach)
+                       ss->post_attach();
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
 }
@@ -4680,14 +4848,15 @@ static void css_free_work_fn(struct work_struct *work)
 
        if (ss) {
                /* css free path */
+               struct cgroup_subsys_state *parent = css->parent;
                int id = css->id;
 
-               if (css->parent)
-                       css_put(css->parent);
-
                ss->css_free(css);
                cgroup_idr_remove(&ss->css_idr, id);
                cgroup_put(cgrp);
+
+               if (parent)
+                       css_put(parent);
        } else {
                /* cgroup free path */
                atomic_dec(&cgrp->root->nr_cgrps);
@@ -4780,9 +4949,11 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
        memset(css, 0, sizeof(*css));
        css->cgroup = cgrp;
        css->ss = ss;
+       css->id = -1;
        INIT_LIST_HEAD(&css->sibling);
        INIT_LIST_HEAD(&css->children);
        css->serial_nr = css_serial_nr_next++;
+       atomic_set(&css->online_cnt, 0);
 
        if (cgroup_parent(cgrp)) {
                css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@@ -4805,6 +4976,10 @@ static int online_css(struct cgroup_subsys_state *css)
        if (!ret) {
                css->flags |= CSS_ONLINE;
                rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+
+               atomic_inc(&css->online_cnt);
+               if (css->parent)
+                       atomic_inc(&css->parent->online_cnt);
        }
        return ret;
 }
@@ -5036,10 +5211,15 @@ static void css_killed_work_fn(struct work_struct *work)
                container_of(work, struct cgroup_subsys_state, destroy_work);
 
        mutex_lock(&cgroup_mutex);
-       offline_css(css);
-       mutex_unlock(&cgroup_mutex);
 
-       css_put(css);
+       do {
+               offline_css(css);
+               css_put(css);
+               /* @css can't go away while we're holding cgroup_mutex */
+               css = css->parent;
+       } while (css && atomic_dec_and_test(&css->online_cnt));
+
+       mutex_unlock(&cgroup_mutex);
 }
 
 /* css kill confirmation processing requires process context, bounce */
@@ -5048,8 +5228,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
        struct cgroup_subsys_state *css =
                container_of(ref, struct cgroup_subsys_state, refcnt);
 
-       INIT_WORK(&css->destroy_work, css_killed_work_fn);
-       queue_work(cgroup_destroy_wq, &css->destroy_work);
+       if (atomic_dec_and_test(&css->online_cnt)) {
+               INIT_WORK(&css->destroy_work, css_killed_work_fn);
+               queue_work(cgroup_destroy_wq, &css->destroy_work);
+       }
 }
 
 /**
@@ -5118,6 +5300,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
        struct cgroup_subsys_state *css;
+       struct cgrp_cset_link *link;
        int ssid;
 
        lockdep_assert_held(&cgroup_mutex);
@@ -5138,11 +5321,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
                return -EBUSY;
 
        /*
-        * Mark @cgrp dead.  This prevents further task migration and child
-        * creation by disabling cgroup_lock_live_group().
+        * Mark @cgrp and the associated csets dead.  The former prevents
+        * further task migration and child creation by disabling
+        * cgroup_lock_live_group().  The latter makes the csets ignored by
+        * the migration path.
         */
        cgrp->self.flags &= ~CSS_ONLINE;
 
+       spin_lock_bh(&css_set_lock);
+       list_for_each_entry(link, &cgrp->cset_links, cset_link)
+               link->cset->dead = true;
+       spin_unlock_bh(&css_set_lock);
+
        /* initiate massacre of all css's */
        for_each_css(css, ssid, cgrp)
                kill_css(css);
@@ -5182,6 +5372,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
        .mkdir                  = cgroup_mkdir,
        .rmdir                  = cgroup_rmdir,
        .rename                 = cgroup_rename,
+       .show_path              = cgroup_show_path,
 };
 
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
@@ -5291,6 +5482,8 @@ int __init cgroup_init(void)
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
+       get_user_ns(init_cgroup_ns.user_ns);
+
        mutex_lock(&cgroup_mutex);
 
        /* Add init_css_set to the hash table */
@@ -5427,7 +5620,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                 * " (deleted)" is appended to the cgroup path.
                 */
                if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
-                       path = cgroup_path(cgrp, buf, PATH_MAX);
+                       path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
+                                                    current->nsproxy->cgroup_ns);
                        if (!path) {
                                retval = -ENAMETOOLONG;
                                goto out_unlock;
@@ -5725,7 +5919,10 @@ static void cgroup_release_agent(struct work_struct *work)
        if (!pathbuf || !agentbuf)
                goto out;
 
-       path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+       spin_lock_bh(&css_set_lock);
+       path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX,
+                                    &init_cgroup_ns);
+       spin_unlock_bh(&css_set_lock);
        if (!path)
                goto out;
 
@@ -5822,6 +6019,133 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
        return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
 }
 
+/* cgroup namespaces */
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+       struct cgroup_namespace *new_ns;
+       int ret;
+
+       new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+       if (!new_ns)
+               return ERR_PTR(-ENOMEM);
+       ret = ns_alloc_inum(&new_ns->ns);
+       if (ret) {
+               kfree(new_ns);
+               return ERR_PTR(ret);
+       }
+       atomic_set(&new_ns->count, 1);
+       new_ns->ns.ops = &cgroupns_operations;
+       return new_ns;
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+       put_css_set(ns->root_cset);
+       put_user_ns(ns->user_ns);
+       ns_free_inum(&ns->ns);
+       kfree(ns);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *
+copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
+              struct cgroup_namespace *old_ns)
+{
+       struct cgroup_namespace *new_ns;
+       struct css_set *cset;
+
+       BUG_ON(!old_ns);
+
+       if (!(flags & CLONE_NEWCGROUP)) {
+               get_cgroup_ns(old_ns);
+               return old_ns;
+       }
+
+       /* Allow only sysadmin to create cgroup namespace. */
+       if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       mutex_lock(&cgroup_mutex);
+       spin_lock_bh(&css_set_lock);
+
+       cset = task_css_set(current);
+       get_css_set(cset);
+
+       spin_unlock_bh(&css_set_lock);
+       mutex_unlock(&cgroup_mutex);
+
+       new_ns = alloc_cgroup_ns();
+       if (IS_ERR(new_ns)) {
+               put_css_set(cset);
+               return new_ns;
+       }
+
+       new_ns->user_ns = get_user_ns(user_ns);
+       new_ns->root_cset = cset;
+
+       return new_ns;
+}
+
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+       return container_of(ns, struct cgroup_namespace, ns);
+}
+
+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+       struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
+
+       if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+           !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /* Don't need to do anything if we are attaching to our own cgroupns. */
+       if (cgroup_ns == nsproxy->cgroup_ns)
+               return 0;
+
+       get_cgroup_ns(cgroup_ns);
+       put_cgroup_ns(nsproxy->cgroup_ns);
+       nsproxy->cgroup_ns = cgroup_ns;
+
+       return 0;
+}
+
+static struct ns_common *cgroupns_get(struct task_struct *task)
+{
+       struct cgroup_namespace *ns = NULL;
+       struct nsproxy *nsproxy;
+
+       task_lock(task);
+       nsproxy = task->nsproxy;
+       if (nsproxy) {
+               ns = nsproxy->cgroup_ns;
+               get_cgroup_ns(ns);
+       }
+       task_unlock(task);
+
+       return ns ? &ns->ns : NULL;
+}
+
+static void cgroupns_put(struct ns_common *ns)
+{
+       put_cgroup_ns(to_cg_ns(ns));
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+       .name           = "cgroup",
+       .type           = CLONE_NEWCGROUP,
+       .get            = cgroupns_get,
+       .put            = cgroupns_put,
+       .install        = cgroupns_install,
+};
+
+static __init int cgroup_namespaces_init(void)
+{
+       return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *
 debug_css_alloc(struct cgroup_subsys_state *parent_css)