bindings: split cgroup part of lxcfs into separate files

author Christian Brauner <christian.brauner@ubuntu.com>

Tue, 25 Feb 2020 16:17:10 +0000 (17:17 +0100)

committer Christian Brauner <christian.brauner@ubuntu.com>

Tue, 25 Feb 2020 16:18:44 +0000 (17:18 +0100)
author Christian Brauner <christian.brauner@ubuntu.com>
Tue, 25 Feb 2020 16:17:10 +0000 (17:17 +0100)
committer Christian Brauner <christian.brauner@ubuntu.com>
Tue, 25 Feb 2020 16:18:44 +0000 (17:18 +0100)
diff --git a/Makefile.am b/Makefile.am

index d37aa7f2e2f82ca7907943c8751de594a30fab8a..e3c4c24936c52e49b16c0c1520af7b0ccfe5ad7b 100644 (file)
--- a/Makefile.am
+++ b/Makefile.am
@@ -13,6 +13,7 @@ AM_LDFLAGS = $(FUSE_LIBS) -pthread
  AM_CFLAGS += -DRUNTIME_PATH=\"$(RUNTIME_PATH)\"
  
  liblxcfs_la_SOURCES = bindings.c bindings.h \
+                     cgroup_fuse.c cgroup_fuse.h \
                       cgroups/cgfsng.c \
                       cgroups/cgroup.c cgroups/cgroup.h \
                       cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \
@@ -25,6 +26,7 @@ liblxcfs_la_CFLAGS = $(AM_CFLAGS)
  liblxcfs_la_LDFLAGS = $(AM_CFLAGS) -module -avoid-version -shared
  
  liblxcfstest_la_SOURCES = bindings.c bindings.h \
+                         cgroup_fuse.c cgroup_fuse.h \
                            cgroups/cgfsng.c \
                            cgroups/cgroup.c cgroups/cgroup.h \
                            cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \
diff --git a/bindings.c b/bindings.c

index ddaa528fe7bcc2bb4e43e2e046f4a0736f8a99ba..83243b8f4403fcdf17c48c063ead9a3ab90a7480 100644 (file)
--- a/bindings.c
+++ b/bindings.c
@@ -39,6 +39,7 @@
  
  #include "bindings.h"
  #include "config.h"
+#include "cgroup_fuse.h"
  #include "cgroups/cgroup.h"
  #include "cgroups/cgroup_utils.h"
  #include "memory_utils.h"
@@ -574,41 +575,6 @@ static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
         return NULL;
  }
  
-static int is_dir(const char *path, int fd)
-{
-       struct stat statbuf;
-       int ret = fstatat(fd, path, &statbuf, fd);
-       if (ret == 0 && S_ISDIR(statbuf.st_mode))
-               return 1;
-       return 0;
-}
-
-static bool write_string(const char *fnam, const char *string, int fd)
-{
-       FILE *f;
-       size_t len, ret;
-
-       f = fdopen(fd, "w");
-       if (!f)
-               return false;
-
-       len = strlen(string);
-       ret = fwrite(string, 1, len, f);
-       if (ret != len) {
-               lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
-                           strerror(errno), string, fnam);
-               fclose(f);
-               return false;
-       }
-
-       if (fclose(f) < 0) {
-               lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
-               return false;
-       }
-
-       return true;
-}
-
  struct cgfs_files {
         char *name;
         uint32_t uid, gid;
@@ -627,10 +593,9 @@ static void print_subsystems(void)
         }
  }
  
-bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
-               const char *value)
+bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
  {
-       int ret, fd, cfd;
+       int ret, cfd;
         size_t len;
         char *fnam;
  
@@ -647,2401 +612,353 @@ bool cgfs_set_value(const char *controller, const char *cgroup, const char *file
         if (ret < 0 || (size_t)ret >= len)
                 return false;
  
-       fd = openat(cfd, fnam, O_WRONLY);
-       if (fd < 0)
-               return false;
-
-       return write_string(fnam, value, fd);
+       return (faccessat(cfd, fnam, F_OK, 0) == 0);
  }
  
-// Chown all the files in the cgroup directory.  We do this when we create
-// a cgroup on behalf of a user.
-static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
-{
-       struct dirent *direntp;
-       char path[MAXPATHLEN];
-       size_t len;
-       DIR *d;
-       int fd1, ret;
+#define SEND_CREDS_OK 0
+#define SEND_CREDS_NOTSK 1
+#define SEND_CREDS_FAIL 2
+static bool recv_creds(int sock, struct ucred *cred, char *v);
+static int wait_for_pid(pid_t pid);
+static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
+static int send_creds_clone_wrapper(void *arg);
  
-       len = strlen(dirname);
-       if (len >= MAXPATHLEN) {
-               lxcfs_error("Pathname too long: %s\n", dirname);
-               return;
-       }
+/*
+ * Clone a task which switches to @target's pid namespace and writes '1'
+ * over a unix sock so we can read the task's reaper's pid in our
+ * namespace.
+ *
+ * Note: glibc's fork() does not respect pidns, which can lead to failed
+ * assertions inside glibc (and thus failed forks) if the child's pid in
+ * the pidns and the parent pid outside are identical. Using clone prevents
+ * this issue.
+ */
+static void write_task_init_pid_exit(int sock, pid_t target)
+{
+       char fnam[100];
+       pid_t pid;
+       int fd, ret;
+       size_t stack_size = sysconf(_SC_PAGESIZE);
+       void *stack = alloca(stack_size);
  
-       fd1 = openat(fd, dirname, O_DIRECTORY);
-       if (fd1 < 0)
-               return;
+       ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
+       if (ret < 0 || ret >= sizeof(fnam))
+               _exit(1);
  
-       d = fdopendir(fd1);
-       if (!d) {
-               lxcfs_error("Failed to open %s\n", dirname);
-               return;
+       fd = open(fnam, O_RDONLY);
+       if (fd < 0) {
+               perror("write_task_init_pid_exit open of ns/pid");
+               _exit(1);
         }
-
-       while ((direntp = readdir(d))) {
-               if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
-                       continue;
-               ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
-               if (ret < 0 || ret >= MAXPATHLEN) {
-                       lxcfs_error("Pathname too long under %s\n", dirname);
-                       continue;
-               }
-               if (fchownat(fd, path, uid, gid, 0) < 0)
-                       lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
+       if (setns(fd, 0)) {
+               perror("write_task_init_pid_exit setns 1");
+               close(fd);
+               _exit(1);
+       }
+       pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
+       if (pid < 0)
+               _exit(1);
+       if (pid != 0) {
+               if (!wait_for_pid(pid))
+                       _exit(1);
+               _exit(0);
         }
-       closedir(d);
  }
  
-int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
-{
-       int cfd;
-       size_t len;
-       char *dirnam;
-
-       cfd = get_cgroup_fd(controller);
-       if (cfd < 0)
-               return -EINVAL;
-
-       /* Make sure we pass a relative path to *at() family of functions.
-        * . + /cg + \0
-        */
-       len = strlen(cg) + 2;
-       dirnam = alloca(len);
-       snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
-
-       if (mkdirat(cfd, dirnam, 0755) < 0)
-               return -errno;
-
-       if (uid == 0 && gid == 0)
-               return 0;
-
-       if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
-               return -errno;
-
-       chown_all_cgroup_files(dirnam, uid, gid, cfd);
+static int send_creds_clone_wrapper(void *arg) {
+       struct ucred cred;
+       char v;
+       int sock = *(int *)arg;
  
+       /* we are the child */
+       cred.uid = 0;
+       cred.gid = 0;
+       cred.pid = 1;
+       v = '1';
+       if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
+               return 1;
         return 0;
  }
  
-static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
+static pid_t get_init_pid_for_task(pid_t task)
  {
-       struct dirent *direntp;
-       DIR *dir;
-       bool ret = false;
-       char pathname[MAXPATHLEN];
-       int dupfd;
-
-       dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
-       if (dupfd < 0)
-               return false;
-
-       dir = fdopendir(dupfd);
-       if (!dir) {
-               lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
-               close(dupfd);
-               return false;
-       }
-
-       while ((direntp = readdir(dir))) {
-               struct stat mystat;
-               int rc;
-
-               if (!strcmp(direntp->d_name, ".") ||
-                   !strcmp(direntp->d_name, ".."))
-                       continue;
-
-               rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
-               if (rc < 0 || rc >= MAXPATHLEN) {
-                       lxcfs_error("%s\n", "Pathname too long.");
-                       continue;
-               }
-
-               rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
-               if (rc) {
-                       lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
-                       continue;
-               }
-               if (S_ISDIR(mystat.st_mode))
-                       if (!recursive_rmdir(pathname, fd, cfd))
-                               lxcfs_debug("Error removing %s.\n", pathname);
-       }
+       int sock[2];
+       pid_t pid;
+       pid_t ret = -1;
+       char v = '0';
+       struct ucred cred;
  
-       ret = true;
-       if (closedir(dir) < 0) {
-               lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
-               ret = false;
+       if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
+               perror("socketpair");
+               return -1;
         }
  
-       if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
-               lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
-               ret = false;
+       pid = fork();
+       if (pid < 0)
+               goto out;
+       if (!pid) {
+               close(sock[1]);
+               write_task_init_pid_exit(sock[0], task);
+               _exit(0);
         }
  
-       close(dupfd);
+       if (!recv_creds(sock[1], &cred, &v))
+               goto out;
+       ret = cred.pid;
  
+out:
+       close(sock[0]);
+       close(sock[1]);
+       if (pid > 0)
+               wait_for_pid(pid);
         return ret;
  }
  
-bool cgfs_remove(const char *controller, const char *cg)
+pid_t lookup_initpid_in_store(pid_t qpid)
  {
-       int fd, cfd;
-       size_t len;
-       char *dirnam;
-       bool bret;
-
-       cfd = get_cgroup_fd(controller);
-       if (cfd < 0)
-               return false;
-
-       /* Make sure we pass a relative path to *at() family of functions.
-        * . +  /cg + \0
-        */
-       len = strlen(cg) + 2;
-       dirnam = alloca(len);
-       snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
+       pid_t answer = 0;
+       struct stat sb;
+       struct pidns_init_store *e;
+       char fnam[100];
  
-       fd = openat(cfd, dirnam, O_DIRECTORY);
-       if (fd < 0)
-               return false;
+       snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
+       store_lock();
+       if (stat(fnam, &sb) < 0)
+               goto out;
+       e = lookup_verify_initpid(&sb);
+       if (e) {
+               answer = e->initpid;
+               goto out;
+       }
+       answer = get_init_pid_for_task(qpid);
+       if (answer > 0)
+               save_initpid(&sb, answer);
  
-       bret = recursive_rmdir(dirnam, fd, cfd);
-       close(fd);
-       return bret;
+out:
+       /* we prune at end in case we are returning
+        * the value we were about to return */
+       prune_initpid_store();
+       store_unlock();
+       return answer;
  }
  
-bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
+static int wait_for_pid(pid_t pid)
  {
-       int cfd;
-       size_t len;
-       char *pathname;
-
-       cfd = get_cgroup_fd(controller);
-       if (cfd < 0)
-               return false;
+       int status, ret;
  
-       /* Make sure we pass a relative path to *at() family of functions.
-        * . + /file + \0
-        */
-       len = strlen(file) + 2;
-       pathname = alloca(len);
-       snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
-       if (fchmodat(cfd, pathname, mode, 0) < 0)
-               return false;
-       return true;
-}
+       if (pid <= 0)
+               return -1;
  
-static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
-{
-       size_t len;
-       char *fname;
-
-       len = strlen(dirname) + strlen("/cgroup.procs") + 1;
-       fname = alloca(len);
-       snprintf(fname, len, "%s/tasks", dirname);
-       if (fchownat(fd, fname, uid, gid, 0) != 0)
-               return -errno;
-       snprintf(fname, len, "%s/cgroup.procs", dirname);
-       if (fchownat(fd, fname, uid, gid, 0) != 0)
-               return -errno;
+again:
+       ret = waitpid(pid, &status, 0);
+       if (ret == -1) {
+               if (errno == EINTR)
+                       goto again;
+               return -1;
+       }
+       if (ret != pid)
+               goto again;
+       if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
+               return -1;
         return 0;
  }
  
-int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
+char *get_pid_cgroup(pid_t pid, const char *contrl)
  {
         int cfd;
-       size_t len;
-       char *pathname;
  
-       cfd = get_cgroup_fd(controller);
+       cfd = get_cgroup_fd(contrl);
         if (cfd < 0)
                 return false;
  
-       /* Make sure we pass a relative path to *at() family of functions.
-        * . + /file + \0
-        */
-       len = strlen(file) + 2;
-       pathname = alloca(len);
-       snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
-       if (fchownat(cfd, pathname, uid, gid, 0) < 0)
-               return -errno;
-
-       if (is_dir(pathname, cfd))
-               // like cgmanager did, we want to chown the tasks file as well
-               return chown_tasks_files(pathname, uid, gid, cfd);
+       if (pure_unified_layout(cgroup_ops))
+               return cg_unified_get_current_cgroup(pid);
  
-       return 0;
+       return cg_legacy_get_current_cgroup(pid, contrl);
  }
  
-FILE *open_pids_file(const char *controller, const char *cgroup)
+#define INITSCOPE "/init.scope"
+void prune_init_slice(char *cg)
  {
-       int fd, cfd;
-       size_t len;
-       char *pathname;
-
-       cfd = get_cgroup_fd(controller);
-       if (cfd < 0)
-               return false;
-
-       /* Make sure we pass a relative path to *at() family of functions.
-        * . + /cgroup + / "cgroup.procs" + \0
-        */
-       len = strlen(cgroup) + strlen("cgroup.procs") + 3;
-       pathname = alloca(len);
-       snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);
+       char *point;
+       size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
  
-       fd = openat(cfd, pathname, O_WRONLY);
-       if (fd < 0)
-               return NULL;
+       if (cg_len < initscope_len)
+               return;
  
-       return fdopen(fd, "w");
+       point = cg + cg_len - initscope_len;
+       if (strcmp(point, INITSCOPE) == 0) {
+               if (point == cg)
+                       *(point+1) = '\0';
+               else
+                       *point = '\0';
+       }
  }
  
-static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
-                                void ***list, size_t typesize,
-                                void* (*iterator)(const char*, const char*, const char*))
+#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
+
+static bool wait_for_sock(int sock, int timeout)
  {
-       int cfd, fd, ret;
-       size_t len;
-       char *cg;
-       char pathname[MAXPATHLEN];
-       size_t sz = 0, asz = 0;
-       struct dirent *dirent;
-       DIR *dir;
+       struct epoll_event ev;
+       int epfd, ret, now, starttime, deltatime, saved_errno;
  
-       cfd = get_cgroup_fd(controller);
-       *list = NULL;
-       if (cfd < 0)
+       if ((starttime = time(NULL)) < 0)
                 return false;
  
-       /* Make sure we pass a relative path to *at() family of functions. */
-       len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
-       cg = alloca(len);
-       ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
-       if (ret < 0 || (size_t)ret >= len) {
-               lxcfs_error("Pathname too long under %s\n", cgroup);
+       if ((epfd = epoll_create(1)) < 0) {
+               lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
                 return false;
         }
  
-       fd = openat(cfd, cg, O_DIRECTORY);
-       if (fd < 0)
-               return false;
-
-       dir = fdopendir(fd);
-       if (!dir)
-               return false;
-
-       while ((dirent = readdir(dir))) {
-               struct stat mystat;
-
-               if (!strcmp(dirent->d_name, ".") ||
-                   !strcmp(dirent->d_name, ".."))
-                       continue;
-
-               ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
-               if (ret < 0 || ret >= MAXPATHLEN) {
-                       lxcfs_error("Pathname too long under %s\n", cg);
-                       continue;
-               }
-
-               ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
-               if (ret) {
-                       lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
-                       continue;
-               }
-               if ((!directories && !S_ISREG(mystat.st_mode)) ||
-                   (directories && !S_ISDIR(mystat.st_mode)))
-                       continue;
-
-               if (sz+2 >= asz) {
-                       void **tmp;
-                       asz += BATCH_SIZE;
-                       do {
-                               tmp = realloc(*list, asz * typesize);
-                       } while  (!tmp);
-                       *list = tmp;
-               }
-               (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
-               (*list)[sz+1] = NULL;
-               sz++;
-       }
-       if (closedir(dir) < 0) {
-               lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
-               return false;
-       }
-       return true;
-}
-
-static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
-{
-       char *dup;
-       do {
-               dup = strdup(dir_entry);
-       } while (!dup);
-       return dup;
-}
-
-bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
-{
-       return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
-}
-
-void free_key(struct cgfs_files *k)
-{
-       if (!k)
-               return;
-       free_disarm(k->name);
-       free_disarm(k);
-}
-
-void free_keys(struct cgfs_files **keys)
-{
-       int i;
-
-       if (!keys)
-               return;
-       for (i = 0; keys[i]; i++) {
-               free_key(keys[i]);
-       }
-       free_disarm(keys);
-}
-
-bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
-{
-       int ret, cfd;
-       size_t len;
-       char *fnam;
-
-       cfd = get_cgroup_fd(controller);
-       if (cfd < 0)
-               return false;
-
-       /* Make sure we pass a relative path to *at() family of functions.
-        * . + /cgroup + / + file + \0
-        */
-       len = strlen(cgroup) + strlen(file) + 3;
-       fnam = alloca(len);
-       ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
-       if (ret < 0 || (size_t)ret >= len)
-               return false;
-
-       return (faccessat(cfd, fnam, F_OK, 0) == 0);
-}
-
-struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
-{
-       int ret, cfd;
-       size_t len;
-       char *fnam;
-       struct stat sb;
-       struct cgfs_files *newkey;
-
-       cfd = get_cgroup_fd(controller);
-       if (cfd < 0)
-               return false;
-
-       if (file && *file == '/')
-               file++;
-
-       if (file && strchr(file, '/'))
-               return NULL;
-
-       /* Make sure we pass a relative path to *at() family of functions.
-        * . + /cgroup + / + file + \0
-        */
-       len = strlen(cgroup) + 3;
-       if (file)
-               len += strlen(file) + 1;
-       fnam = alloca(len);
-       snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
-                file ? "/" : "", file ? file : "");
-
-       ret = fstatat(cfd, fnam, &sb, 0);
-       if (ret < 0)
-               return NULL;
-
-       do {
-               newkey = malloc(sizeof(struct cgfs_files));
-       } while (!newkey);
-       if (file)
-               newkey->name = must_copy_string(file);
-       else if (strrchr(cgroup, '/'))
-               newkey->name = must_copy_string(strrchr(cgroup, '/'));
-       else
-               newkey->name = must_copy_string(cgroup);
-       newkey->uid = sb.st_uid;
-       newkey->gid = sb.st_gid;
-       newkey->mode = sb.st_mode;
-
-       return newkey;
-}
-
-static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
-{
-       struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
-       if (!entry) {
-               lxcfs_error("Error getting files under %s:%s\n", controller,
-                            cgroup);
-       }
-       return entry;
-}
-
-bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
-{
-       return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
-}
-
-bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
-{
-       int cfd;
-       size_t len;
-       char *fnam;
-       int ret;
-       struct stat sb;
-
-       cfd = get_cgroup_fd(controller);
-       if (cfd < 0)
-               return false;
-
-       /* Make sure we pass a relative path to *at() family of functions.
-        * . + /cgroup + / + f + \0
-        */
-       len = strlen(cgroup) + strlen(f) + 3;
-       fnam = alloca(len);
-       ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
-       if (ret < 0 || (size_t)ret >= len)
-               return false;
-
-       ret = fstatat(cfd, fnam, &sb, 0);
-       if (ret < 0 || !S_ISDIR(sb.st_mode))
-               return false;
-
-       return true;
-}
-
-#define SEND_CREDS_OK 0
-#define SEND_CREDS_NOTSK 1
-#define SEND_CREDS_FAIL 2
-static bool recv_creds(int sock, struct ucred *cred, char *v);
-static int wait_for_pid(pid_t pid);
-static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
-static int send_creds_clone_wrapper(void *arg);
-
-/*
- * clone a task which switches to @task's namespace and writes '1'.
- * over a unix sock so we can read the task's reaper's pid in our
- * namespace
- *
- * Note: glibc's fork() does not respect pidns, which can lead to failed
- * assertions inside glibc (and thus failed forks) if the child's pid in
- * the pidns and the parent pid outside are identical. Using clone prevents
- * this issue.
- */
-static void write_task_init_pid_exit(int sock, pid_t target)
-{
-       char fnam[100];
-       pid_t pid;
-       int fd, ret;
-       size_t stack_size = sysconf(_SC_PAGESIZE);
-       void *stack = alloca(stack_size);
-
-       ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
-       if (ret < 0 || ret >= sizeof(fnam))
-               _exit(1);
-
-       fd = open(fnam, O_RDONLY);
-       if (fd < 0) {
-               perror("write_task_init_pid_exit open of ns/pid");
-               _exit(1);
-       }
-       if (setns(fd, 0)) {
-               perror("write_task_init_pid_exit setns 1");
-               close(fd);
-               _exit(1);
-       }
-       pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
-       if (pid < 0)
-               _exit(1);
-       if (pid != 0) {
-               if (!wait_for_pid(pid))
-                       _exit(1);
-               _exit(0);
-       }
-}
-
-static int send_creds_clone_wrapper(void *arg) {
-       struct ucred cred;
-       char v;
-       int sock = *(int *)arg;
-
-       /* we are the child */
-       cred.uid = 0;
-       cred.gid = 0;
-       cred.pid = 1;
-       v = '1';
-       if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
-               return 1;
-       return 0;
-}
-
-static pid_t get_init_pid_for_task(pid_t task)
-{
-       int sock[2];
-       pid_t pid;
-       pid_t ret = -1;
-       char v = '0';
-       struct ucred cred;
-
-       if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
-               perror("socketpair");
-               return -1;
-       }
-
-       pid = fork();
-       if (pid < 0)
-               goto out;
-       if (!pid) {
-               close(sock[1]);
-               write_task_init_pid_exit(sock[0], task);
-               _exit(0);
-       }
-
-       if (!recv_creds(sock[1], &cred, &v))
-               goto out;
-       ret = cred.pid;
-
-out:
-       close(sock[0]);
-       close(sock[1]);
-       if (pid > 0)
-               wait_for_pid(pid);
-       return ret;
-}
-
-pid_t lookup_initpid_in_store(pid_t qpid)
-{
-       pid_t answer = 0;
-       struct stat sb;
-       struct pidns_init_store *e;
-       char fnam[100];
-
-       snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
-       store_lock();
-       if (stat(fnam, &sb) < 0)
-               goto out;
-       e = lookup_verify_initpid(&sb);
-       if (e) {
-               answer = e->initpid;
-               goto out;
-       }
-       answer = get_init_pid_for_task(qpid);
-       if (answer > 0)
-               save_initpid(&sb, answer);
-
-out:
-       /* we prune at end in case we are returning
-        * the value we were about to return */
-       prune_initpid_store();
-       store_unlock();
-       return answer;
-}
-
-static int wait_for_pid(pid_t pid)
-{
-       int status, ret;
-
-       if (pid <= 0)
-               return -1;
-
-again:
-       ret = waitpid(pid, &status, 0);
-       if (ret == -1) {
-               if (errno == EINTR)
-                       goto again;
-               return -1;
-       }
-       if (ret != pid)
-               goto again;
-       if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
-               return -1;
-       return 0;
-}
-
-
-/*
- * append pid to *src.
- * src: a pointer to a char* in which ot append the pid.
- * sz: the number of characters printed so far, minus trailing \0.
- * asz: the allocated size so far
- * pid: the pid to append
- */
-static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
-{
-       must_strcat(src, sz, asz, "%d\n", (int)pid);
-}
-
-/*
- * Given a open file * to /proc/pid/{u,g}id_map, and an id
- * valid in the caller's namespace, return the id mapped into
- * pid's namespace.
- * Returns the mapped id, or -1 on error.
- */
-unsigned int
-convert_id_to_ns(FILE *idfile, unsigned int in_id)
-{
-       unsigned int nsuid,   // base id for a range in the idfile's namespace
-                    hostuid, // base id for a range in the caller's namespace
-                    count;   // number of ids in this range
-       char line[400];
-       int ret;
-
-       fseek(idfile, 0L, SEEK_SET);
-       while (fgets(line, 400, idfile)) {
-               ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
-               if (ret != 3)
-                       continue;
-               if (hostuid + count < hostuid || nsuid + count < nsuid) {
-                       /*
-                        * uids wrapped around - unexpected as this is a procfile,
-                        * so just bail.
-                        */
-                       lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
-                               nsuid, hostuid, count, line);
-                       return -1;
-               }
-               if (hostuid <= in_id && hostuid+count > in_id) {
-                       /*
-                        * now since hostuid <= in_id < hostuid+count, and
-                        * hostuid+count and nsuid+count do not wrap around,
-                        * we know that nsuid+(in_id-hostuid) which must be
-                        * less that nsuid+(count) must not wrap around
-                        */
-                       return (in_id - hostuid) + nsuid;
-               }
-       }
-
-       // no answer found
-       return -1;
-}
-
-/*
- * for is_privileged_over,
- * specify whether we require the calling uid to be root in his
- * namespace
- */
-#define NS_ROOT_REQD true
-#define NS_ROOT_OPT false
-
-#define PROCLEN 100
-
-static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
-{
-       char fpath[PROCLEN];
-       int ret;
-       bool answer = false;
-       uid_t nsuid;
-
-       if (victim == -1 || uid == -1)
-               return false;
-
-       /*
-        * If the request is one not requiring root in the namespace,
-        * then having the same uid suffices.  (i.e. uid 1000 has write
-        * access to files owned by uid 1000
-        */
-       if (!req_ns_root && uid == victim)
-               return true;
-
-       ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
-       if (ret < 0 || ret >= PROCLEN)
-               return false;
-       FILE *f = fopen(fpath, "r");
-       if (!f)
-               return false;
-
-       /* if caller's not root in his namespace, reject */
-       nsuid = convert_id_to_ns(f, uid);
-       if (nsuid)
-               goto out;
-
-       /*
-        * If victim is not mapped into caller's ns, reject.
-        * XXX I'm not sure this check is needed given that fuse
-        * will be sending requests where the vfs has converted
-        */
-       nsuid = convert_id_to_ns(f, victim);
-       if (nsuid == -1)
-               goto out;
-
-       answer = true;
-
-out:
-       fclose(f);
-       return answer;
-}
-
-static bool perms_include(int fmode, mode_t req_mode)
-{
-       mode_t r;
-
-       switch (req_mode & O_ACCMODE) {
-       case O_RDONLY:
-               r = S_IROTH;
-               break;
-       case O_WRONLY:
-               r = S_IWOTH;
-               break;
-       case O_RDWR:
-               r = S_IROTH | S_IWOTH;
-               break;
-       default:
-               return false;
-       }
-       return ((fmode & r) == r);
-}
-
-
-/*
- * taskcg is  a/b/c
- * querycg is /a/b/c/d/e
- * we return 'd'
- */
-static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
-{
-       char *start, *end;
-
-       if (strlen(taskcg) <= strlen(querycg)) {
-               lxcfs_error("%s\n", "I was fed bad input.");
-               return NULL;
-       }
-
-       if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
-               start =  strdup(taskcg + 1);
-       else
-               start = strdup(taskcg + strlen(querycg) + 1);
-       if (!start)
-               return NULL;
-       end = strchr(start, '/');
-       if (end)
-               *end = '\0';
-       return start;
-}
-
-char *get_pid_cgroup(pid_t pid, const char *contrl)
-{
-       int cfd;
-
-       cfd = get_cgroup_fd(contrl);
-       if (cfd < 0)
-               return false;
-
-       if (pure_unified_layout(cgroup_ops))
-               return cg_unified_get_current_cgroup(pid);
-
-       return cg_legacy_get_current_cgroup(pid, contrl);
-}
-
-/*
- * check whether a fuse context may access a cgroup dir or file
- *
- * If file is not null, it is a cgroup file to check under cg.
- * If file is null, then we are checking perms on cg itself.
- *
- * For files we can check the mode of the list_keys result.
- * For cgroups, we must make assumptions based on the files under the
- * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
- * yet.
- */
-static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
-{
-       struct cgfs_files *k = NULL;
-       bool ret = false;
-
-       k = cgfs_get_key(contrl, cg, file);
-       if (!k)
-               return false;
-
-       if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
-               if (perms_include(k->mode >> 6, mode)) {
-                       ret = true;
-                       goto out;
-               }
-       }
-       if (fc->gid == k->gid) {
-               if (perms_include(k->mode >> 3, mode)) {
-                       ret = true;
-                       goto out;
-               }
-       }
-       ret = perms_include(k->mode, mode);
-
-out:
-       free_key(k);
-       return ret;
-}
-
-#define INITSCOPE "/init.scope"
-void prune_init_slice(char *cg)
-{
-       char *point;
-       size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
-
-       if (cg_len < initscope_len)
-               return;
-
-       point = cg + cg_len - initscope_len;
-       if (strcmp(point, INITSCOPE) == 0) {
-               if (point == cg)
-                       *(point+1) = '\0';
-               else
-                       *point = '\0';
-       }
-}
-
-/*
- * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
- * If pid is in /a, he may act on /a/b, but not on /b.
- * if the answer is false and nextcg is not NULL, then *nextcg will point
- * to a string containing the next cgroup directory under cg, which must be
- * freed by the caller.
- */
-static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
-{
-       bool answer = false;
-       char *c2 = get_pid_cgroup(pid, contrl);
-       char *linecmp;
-
-       if (!c2)
-               return false;
-       prune_init_slice(c2);
-
-       /*
-        * callers pass in '/' or './' (openat()) for root cgroup, otherwise
-        * they pass in a cgroup without leading '/'
-        *
-        * The original line here was:
-        *      linecmp = *cg == '/' ? c2 : c2+1;
-        * TODO: I'm not sure why you'd want to increment when *cg != '/'?
-        *       Serge, do you know?
-        */
-       if (*cg == '/' || !strncmp(cg, "./", 2))
-               linecmp = c2;
-       else
-               linecmp = c2 + 1;
-       if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
-               if (nextcg) {
-                       *nextcg = get_next_cgroup_dir(linecmp, cg);
-               }
-               goto out;
-       }
-       answer = true;
-
-out:
-       free(c2);
-       return answer;
-}
-
-/*
- * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
- */
-static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
-{
-       bool answer = false;
-       char *c2, *task_cg;
-       size_t target_len, task_len;
-
-       if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
-               return true;
-
-       c2 = get_pid_cgroup(pid, contrl);
-       if (!c2)
-               return false;
-       prune_init_slice(c2);
-
-       task_cg = c2 + 1;
-       target_len = strlen(cg);
-       task_len = strlen(task_cg);
-       if (task_len == 0) {
-               /* Task is in the root cg, it can see everything. This case is
-                * not handled by the strcmps below, since they test for the
-                * last /, but that is the first / that we've chopped off
-                * above.
-                */
-               answer = true;
-               goto out;
-       }
-       if (strcmp(cg, task_cg) == 0) {
-               answer = true;
-               goto out;
-       }
-       if (target_len < task_len) {
-               /* looking up a parent dir */
-               if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
-                       answer = true;
-               goto out;
-       }
-       if (target_len > task_len) {
-               /* looking up a child dir */
-               if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
-                       answer = true;
-               goto out;
-       }
-
-out:
-       free(c2);
-       return answer;
-}
-
-/*
- * given /cgroup/freezer/a/b, return "freezer".
- * the returned char* should NOT be freed.
- */
-static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
-{
-       const char *p1;
-       char *contr, *slash;
-
-       if (strlen(path) < 9) {
-               errno = EACCES;
-               return NULL;
-       }
-       if (*(path + 7) != '/') {
-               errno = EINVAL;
-               return NULL;
-       }
-       p1 = path + 8;
-       contr = strdupa(p1);
-       if (!contr) {
-               errno = ENOMEM;
-               return NULL;
-       }
-       slash = strstr(contr, "/");
-       if (slash)
-               *slash = '\0';
-
-       for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
-               if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
-                       return (*h)->__controllers;
-       }
-       errno = ENOENT;
-       return NULL;
-}
-
-/*
- * Find the start of cgroup in /cgroup/controller/the/cgroup/path
- * Note that the returned value may include files (keynames) etc
- */
-static const char *find_cgroup_in_path(const char *path)
-{
-       const char *p1;
-
-       if (strlen(path) < 9) {
-               errno = EACCES;
-               return NULL;
-       }
-       p1 = strstr(path + 8, "/");
-       if (!p1) {
-               errno = EINVAL;
-               return NULL;
-       }
-       errno = 0;
-       return p1 + 1;
-}
-
-/*
- * split the last path element from the path in @cg.
- * @dir is newly allocated and should be freed, @last not
-*/
-static void get_cgdir_and_path(const char *cg, char **dir, char **last)
-{
-       char *p;
-
-       do {
-               *dir = strdup(cg);
-       } while (!*dir);
-       *last = strrchr(cg, '/');
-       if (!*last) {
-               *last = NULL;
-               return;
-       }
-       p = strrchr(*dir, '/');
-       *p = '\0';
-}
-
-/*
- * FUSE ops for /cgroup
- */
-
-int cg_getattr(const char *path, struct stat *sb)
-{
-       struct timespec now;
-       struct fuse_context *fc = fuse_get_context();
-       char * cgdir = NULL;
-       char *last = NULL, *path1, *path2;
-       struct cgfs_files *k = NULL;
-       const char *cgroup;
-       const char *controller = NULL;
-       int ret = -ENOENT;
-
-
-       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-               return -EIO;
-
-       memset(sb, 0, sizeof(struct stat));
-
-       if (clock_gettime(CLOCK_REALTIME, &now) < 0)
-               return -EINVAL;
-
-       sb->st_uid = sb->st_gid = 0;
-       sb->st_atim = sb->st_mtim = sb->st_ctim = now;
-       sb->st_size = 0;
-
-       if (strcmp(path, "/cgroup") == 0) {
-               sb->st_mode = S_IFDIR | 00755;
-               sb->st_nlink = 2;
-               return 0;
-       }
-
-       controller = pick_controller_from_path(fc, path);
-       if (!controller)
-               return -errno;
-       cgroup = find_cgroup_in_path(path);
-       if (!cgroup) {
-               /* this is just /cgroup/controller, return it as a dir */
-               sb->st_mode = S_IFDIR | 00755;
-               sb->st_nlink = 2;
-               return 0;
-       }
-
-       get_cgdir_and_path(cgroup, &cgdir, &last);
-
-       if (!last) {
-               path1 = "/";
-               path2 = cgdir;
-       } else {
-               path1 = cgdir;
-               path2 = last;
-       }
-
-       pid_t initpid = lookup_initpid_in_store(fc->pid);
-       if (initpid <= 1 || is_shared_pidns(initpid))
-               initpid = fc->pid;
-       /* check that path2 is either a child cgroup of path1, or listed in its keys.
-        * Then check that caller's cgroup is under path if last is a child
-        * cgroup, or cgdir if last is a file */
-
-       if (is_child_cgroup(controller, path1, path2)) {
-               if (!caller_may_see_dir(initpid, controller, cgroup)) {
-                       ret = -ENOENT;
-                       goto out;
-               }
-               if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
-                       /* cgroup is not under the caller's own cgroup: show it as a read-only dir */
-                       sb->st_mode = S_IFDIR | 00555;
-                       sb->st_nlink = 2;
-                       ret = 0;
-                       goto out;
-               }
-               if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
-                       ret = -EACCES;
-                       goto out;
-               }
-
-               // get uid, gid, from '/tasks' file and make up a mode
-               // That is a hack, until cgmanager gains a GetCgroupPerms fn.
-               sb->st_mode = S_IFDIR | 00755;
-               k = cgfs_get_key(controller, cgroup, NULL);
-               if (!k) {
-                       sb->st_uid = sb->st_gid = 0;
-               } else {
-                       sb->st_uid = k->uid;
-                       sb->st_gid = k->gid;
-               }
-               free_key(k);
-               sb->st_nlink = 2;
-               ret = 0;
-               goto out;
-       }
-
-       if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
-               sb->st_mode = S_IFREG | k->mode;
-               sb->st_nlink = 1;
-               sb->st_uid = k->uid;
-               sb->st_gid = k->gid;
-               sb->st_size = 0;
-               free_key(k);
-               if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
-                       ret = -ENOENT;
-                       goto out;
-               }
-               ret = 0;
-       }
-
-out:
-       free(cgdir);
-       return ret;
-}
-
-int cg_opendir(const char *path, struct fuse_file_info *fi)
-{
-       struct fuse_context *fc = fuse_get_context();
-       const char *cgroup;
-       struct file_info *dir_info;
-       char *controller = NULL;
-
-       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-               return -EIO;
-
-       if (strcmp(path, "/cgroup") == 0) {
-               cgroup = NULL;
-               controller = NULL;
-       } else {
-               // return list of keys for the controller, and list of child cgroups
-               controller = pick_controller_from_path(fc, path);
-               if (!controller)
-                       return -errno;
-
-               cgroup = find_cgroup_in_path(path);
-               if (!cgroup) {
-                       /* this is just /cgroup/controller, return its contents */
-                       cgroup = "/";
-               }
-       }
-
-       pid_t initpid = lookup_initpid_in_store(fc->pid);
-       if (initpid <= 1 || is_shared_pidns(initpid))
-               initpid = fc->pid;
-       if (cgroup) {
-               if (!caller_may_see_dir(initpid, controller, cgroup))
-                       return -ENOENT;
-               if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
-                       return -EACCES;
-       }
-
-       /* we'll free this at cg_releasedir */
-       dir_info = malloc(sizeof(*dir_info));
-       if (!dir_info)
-               return -ENOMEM;
-       dir_info->controller = must_copy_string(controller);
-       dir_info->cgroup = must_copy_string(cgroup);
-       dir_info->type = LXC_TYPE_CGDIR;
-       dir_info->buf = NULL;
-       dir_info->file = NULL;
-       dir_info->buflen = 0;
-
-       fi->fh = (unsigned long)dir_info;
-       return 0;
-}
-
-int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
-               struct fuse_file_info *fi)
-{
-       struct file_info *d = (struct file_info *)fi->fh;
-       struct cgfs_files **list = NULL;
-       int i, ret;
-       char *nextcg = NULL;
-       struct fuse_context *fc = fuse_get_context();
-       char **clist = NULL;
-
-       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-               return -EIO;
-
-       if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
-               return -EIO;
-
-       if (d->type != LXC_TYPE_CGDIR) {
-               lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
-               return -EIO;
-       }
-       if (!d->cgroup && !d->controller) {
-               /*
-                * ls /var/lib/lxcfs/cgroup - just show list of controllers.
-                * This only works with the legacy hierarchy.
-                */
-               for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
-                       if (is_unified_hierarchy(*h))
-                               continue;
-
-                       if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
-                               return -EIO;
-               }
-
-               return 0;
-       }
-
-       if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
-               // not a valid cgroup
-               ret = -EINVAL;
-               goto out;
-       }
-
-       pid_t initpid = lookup_initpid_in_store(fc->pid);
-       if (initpid <= 1 || is_shared_pidns(initpid))
-               initpid = fc->pid;
-       if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
-               if (nextcg) {
-                       ret = filler(buf, nextcg,  NULL, 0);
-                       free(nextcg);
-                       if (ret != 0) {
-                               ret = -EIO;
-                               goto out;
-                       }
-               }
-               ret = 0;
-               goto out;
-       }
-
-       for (i = 0; list && list[i]; i++) {
-               if (filler(buf, list[i]->name, NULL, 0) != 0) {
-                       ret = -EIO;
-                       goto out;
-               }
-       }
-
-       // now get the list of child cgroups
-
-       if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
-               ret = 0;
-               goto out;
-       }
-       if (clist) {
-               for (i = 0; clist[i]; i++) {
-                       if (filler(buf, clist[i], NULL, 0) != 0) {
-                               ret = -EIO;
-                               goto out;
-                       }
-               }
-       }
-       ret = 0;
-
-out:
-       free_keys(list);
-       if (clist) {
-               for (i = 0; clist[i]; i++)
-                       free(clist[i]);
-               free(clist);
-       }
-       return ret;
-}
-
-void do_release_file_info(struct fuse_file_info *fi)
-{
-       struct file_info *f = (struct file_info *)fi->fh;
-
-       if (!f)
-               return;
-
-       fi->fh = 0;
-
-       free_disarm(f->controller);
-       free_disarm(f->cgroup);
-       free_disarm(f->file);
-       free_disarm(f->buf);
-       free_disarm(f);
-}
-
-int cg_releasedir(const char *path, struct fuse_file_info *fi)
-{
-       do_release_file_info(fi);
-       return 0;
-}
-
-int cg_open(const char *path, struct fuse_file_info *fi)
-{
-       const char *cgroup;
-       char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
-       struct cgfs_files *k = NULL;
-       struct file_info *file_info;
-       struct fuse_context *fc = fuse_get_context();
-       int ret;
-
-       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-               return -EIO;
-
-       controller = pick_controller_from_path(fc, path);
-       if (!controller)
-               return -errno;
-       cgroup = find_cgroup_in_path(path);
-       if (!cgroup)
-               return -errno;
-
-       get_cgdir_and_path(cgroup, &cgdir, &last);
-       if (!last) {
-               path1 = "/";
-               path2 = cgdir;
-       } else {
-               path1 = cgdir;
-               path2 = last;
-       }
-
-       k = cgfs_get_key(controller, path1, path2);
-       if (!k) {
-               ret = -EINVAL;
-               goto out;
-       }
-       free_key(k);
-
-       pid_t initpid = lookup_initpid_in_store(fc->pid);
-       if (initpid <= 1 || is_shared_pidns(initpid))
-               initpid = fc->pid;
-       if (!caller_may_see_dir(initpid, controller, path1)) {
-               ret = -ENOENT;
-               goto out;
-       }
-       if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
-               ret = -EACCES;
-               goto out;
-       }
-
-       /* we'll free this at cg_release */
-       file_info = malloc(sizeof(*file_info));
-       if (!file_info) {
-               ret = -ENOMEM;
-               goto out;
-       }
-       file_info->controller = must_copy_string(controller);
-       file_info->cgroup = must_copy_string(path1);
-       file_info->file = must_copy_string(path2);
-       file_info->type = LXC_TYPE_CGFILE;
-       file_info->buf = NULL;
-       file_info->buflen = 0;
-
-       fi->fh = (unsigned long)file_info;
-       ret = 0;
-
-out:
-       free(cgdir);
-       return ret;
-}
-
-int cg_access(const char *path, int mode)
-{
-       int ret;
-       const char *cgroup;
-       char *path1, *path2, *controller;
-       char *last = NULL, *cgdir = NULL;
-       struct cgfs_files *k = NULL;
-       struct fuse_context *fc = fuse_get_context();
-
-       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-               return -EIO;
-
-       if (strcmp(path, "/cgroup") == 0)
-               return 0;
-
-       controller = pick_controller_from_path(fc, path);
-       if (!controller)
-               return -errno;
-       cgroup = find_cgroup_in_path(path);
-       if (!cgroup) {
-               // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
-               if ((mode & W_OK) == 0)
-                       return 0;
-               return -EACCES;
-       }
-
-       get_cgdir_and_path(cgroup, &cgdir, &last);
-       if (!last) {
-               path1 = "/";
-               path2 = cgdir;
-       } else {
-               path1 = cgdir;
-               path2 = last;
-       }
-
-       k = cgfs_get_key(controller, path1, path2);
-       if (!k) {
-               if ((mode & W_OK) == 0)
-                       ret = 0;
-               else
-                       ret = -EACCES;
-               goto out;
-       }
-       free_key(k);
-
-       pid_t initpid = lookup_initpid_in_store(fc->pid);
-       if (initpid <= 1 || is_shared_pidns(initpid))
-               initpid = fc->pid;
-       if (!caller_may_see_dir(initpid, controller, path1)) {
-               ret = -ENOENT;
-               goto out;
-       }
-       if (!fc_may_access(fc, controller, path1, path2, mode)) {
-               ret = -EACCES;
-               goto out;
-       }
-
-       ret = 0;
-
-out:
-       free(cgdir);
-       return ret;
-}
-
-int cg_release(const char *path, struct fuse_file_info *fi)
-{
-       do_release_file_info(fi);
-       return 0;
-}
-
-#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
-
-static bool wait_for_sock(int sock, int timeout)
-{
-       struct epoll_event ev;
-       int epfd, ret, now, starttime, deltatime, saved_errno;
-
-       if ((starttime = time(NULL)) < 0)
-               return false;
-
-       if ((epfd = epoll_create(1)) < 0) {
-               lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
-               return false;
-       }
-
-       ev.events = POLLIN_SET;
-       ev.data.fd = sock;
-       if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
-               lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
-               close(epfd);
-               return false;
-       }
-
-again:
-       if ((now = time(NULL)) < 0) {
-               close(epfd);
-               return false;
-       }
-
-       deltatime = (starttime + timeout) - now;
-       if (deltatime < 0) { // timeout
-               errno = 0;
-               close(epfd);
-               return false;
-       }
-       ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
-       if (ret < 0 && errno == EINTR)
-               goto again;
-       saved_errno = errno;
-       close(epfd);
-
-       if (ret <= 0) {
-               errno = saved_errno;
-               return false;
-       }
-       return true;
-}
-
-static int msgrecv(int sockfd, void *buf, size_t len)
-{
-       if (!wait_for_sock(sockfd, 2))
-               return -1;
-       return recv(sockfd, buf, len, MSG_DONTWAIT);
-}
-
-static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
-{
-       struct msghdr msg = { 0 };
-       struct iovec iov;
-       struct cmsghdr *cmsg;
-       char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
-       char buf[1];
-       buf[0] = 'p';
-
-       if (pingfirst) {
-               if (msgrecv(sock, buf, 1) != 1) {
-                       lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
-                       return SEND_CREDS_FAIL;
-               }
-       }
-
-       msg.msg_control = cmsgbuf;
-       msg.msg_controllen = sizeof(cmsgbuf);
-
-       cmsg = CMSG_FIRSTHDR(&msg);
-       cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
-       cmsg->cmsg_level = SOL_SOCKET;
-       cmsg->cmsg_type = SCM_CREDENTIALS;
-       memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
-
-       msg.msg_name = NULL;
-       msg.msg_namelen = 0;
-
-       buf[0] = v;
-       iov.iov_base = buf;
-       iov.iov_len = sizeof(buf);
-       msg.msg_iov = &iov;
-       msg.msg_iovlen = 1;
-
-       if (sendmsg(sock, &msg, 0) < 0) {
-               lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
-               if (errno == 3)
-                       return SEND_CREDS_NOTSK;
-               return SEND_CREDS_FAIL;
-       }
-
-       return SEND_CREDS_OK;
-}
-
-static bool recv_creds(int sock, struct ucred *cred, char *v)
-{
-       struct msghdr msg = { 0 };
-       struct iovec iov;
-       struct cmsghdr *cmsg;
-       char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
-       char buf[1];
-       int ret;
-       int optval = 1;
-
-       *v = '1';
-
-       cred->pid = -1;
-       cred->uid = -1;
-       cred->gid = -1;
-
-       if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
-               lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
-               return false;
-       }
-       buf[0] = '1';
-       if (write(sock, buf, 1) != 1) {
-               lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
-               return false;
-       }
-
-       msg.msg_name = NULL;
-       msg.msg_namelen = 0;
-       msg.msg_control = cmsgbuf;
-       msg.msg_controllen = sizeof(cmsgbuf);
-
-       iov.iov_base = buf;
-       iov.iov_len = sizeof(buf);
-       msg.msg_iov = &iov;
-       msg.msg_iovlen = 1;
-
-       if (!wait_for_sock(sock, 2)) {
-               lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
-               return false;
-       }
-       ret = recvmsg(sock, &msg, MSG_DONTWAIT);
-       if (ret < 0) {
-               lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
-               return false;
-       }
-
-       cmsg = CMSG_FIRSTHDR(&msg);
-
-       if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
-                       cmsg->cmsg_level == SOL_SOCKET &&
-                       cmsg->cmsg_type == SCM_CREDENTIALS) {
-               memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
-       }
-       *v = buf[0];
-
-       return true;
-}
-
-struct pid_ns_clone_args {
-       int *cpipe;
-       int sock;
-       pid_t tpid;
-       int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
-};
-
-/*
- * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
- * with clone(). This simply writes '1' as ACK back to the parent
- * before calling the actual wrapped function.
- */
-static int pid_ns_clone_wrapper(void *arg) {
-       struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
-       char b = '1';
-
-       close(args->cpipe[0]);
-       if (write(args->cpipe[1], &b, sizeof(char)) < 0)
-               lxcfs_error("(child): error on write: %s.\n", strerror(errno));
-       close(args->cpipe[1]);
-       return args->wrapped(args->sock, args->tpid);
-}
-
-/*
- * pid_to_ns - reads pids from a ucred over a socket, then writes the
- * int value back over the socket.  This shifts the pid from the
- * sender's pidns into tpid's pidns.
- */
-static int pid_to_ns(int sock, pid_t tpid)
-{
-       char v = '0';
-       struct ucred cred;
-
-       while (recv_creds(sock, &cred, &v)) {
-               if (v == '1')
-                       return 0;
-               if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
-                       return 1;
-       }
-       return 0;
-}
-
-
-/*
- * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
- * in your old pidns.  Only children which you clone will be in the target
- * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
- * actually convert pids.
- *
- * Note: glibc's fork() does not respect pidns, which can lead to failed
- * assertions inside glibc (and thus failed forks) if the child's pid in
- * the pidns and the parent pid outside are identical. Using clone prevents
- * this issue.
- */
-static void pid_to_ns_wrapper(int sock, pid_t tpid)
-{
-       int newnsfd = -1, ret, cpipe[2];
-       char fnam[100];
-       pid_t cpid;
-       char v;
-
-       ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
-       if (ret < 0 || ret >= sizeof(fnam))
-               _exit(1);
-       newnsfd = open(fnam, O_RDONLY);
-       if (newnsfd < 0)
-               _exit(1);
-       if (setns(newnsfd, 0) < 0)
-               _exit(1);
-       close(newnsfd);
-
-       if (pipe(cpipe) < 0)
-               _exit(1);
-
-       struct pid_ns_clone_args args = {
-               .cpipe = cpipe,
-               .sock = sock,
-               .tpid = tpid,
-               .wrapped = &pid_to_ns
-       };
-       size_t stack_size = sysconf(_SC_PAGESIZE);
-       void *stack = alloca(stack_size);
-
-       cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
-       if (cpid < 0)
-               _exit(1);
-
-       // give the child 1 second to be done forking and
-       // write its ack
-       if (!wait_for_sock(cpipe[0], 1))
-               _exit(1);
-       ret = read(cpipe[0], &v, 1);
-       if (ret != sizeof(char) || v != '1')
-               _exit(1);
-
-       if (!wait_for_pid(cpid))
-               _exit(1);
-       _exit(0);
-}
-
-/*
- * To read cgroup files with a particular pid, we will setns into the child
- * pidns, open a pipe, fork a child - which will be the first to really be in
- * the child ns - which does the cgfs_get_value and writes the data to the pipe.
- */
-bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
-{
-       int sock[2] = {-1, -1};
-       char *tmpdata = NULL;
-       int ret;
-       pid_t qpid, cpid = -1;
-       bool answer = false;
-       char v = '0';
-       struct ucred cred;
-       size_t sz = 0, asz = 0;
-
-       if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
-               return false;
-
-       /*
-        * Now we read the pids from returned data one by one, pass
-        * them into a child in the target namespace, read back the
-        * translated pids, and put them into our to-return data
-        */
-
-       if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
-               perror("socketpair");
-               free(tmpdata);
-               return false;
-       }
-
-       cpid = fork();
-       if (cpid == -1)
-               goto out;
-
-       if (!cpid) // child - exits when done
-               pid_to_ns_wrapper(sock[1], tpid);
-
-       char *ptr = tmpdata;
-       cred.uid = 0;
-       cred.gid = 0;
-       while (sscanf(ptr, "%d\n", &qpid) == 1) {
-               cred.pid = qpid;
-               ret = send_creds(sock[0], &cred, v, true);
-
-               if (ret == SEND_CREDS_NOTSK)
-                       goto next;
-               if (ret == SEND_CREDS_FAIL)
-                       goto out;
-
-               // read converted results
-               if (!wait_for_sock(sock[0], 2)) {
-                       lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
-                       goto out;
-               }
-               if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
-                       lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
-                       goto out;
-               }
-               must_strcat_pid(d, &sz, &asz, qpid);
-next:
-               ptr = strchr(ptr, '\n');
-               if (!ptr)
-                       break;
-               ptr++;
-       }
-
-       cred.pid = getpid();
-       v = '1';
-       if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
-               // failed to ask child to exit
-               lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
-               goto out;
-       }
-
-       answer = true;
-
-out:
-       free(tmpdata);
-       if (cpid != -1)
-               wait_for_pid(cpid);
-       if (sock[0] != -1) {
-               close(sock[0]);
-               close(sock[1]);
-       }
-       return answer;
-}
-
-int cg_read(const char *path, char *buf, size_t size, off_t offset,
-               struct fuse_file_info *fi)
-{
-       struct fuse_context *fc = fuse_get_context();
-       struct file_info *f = (struct file_info *)fi->fh;
-       struct cgfs_files *k = NULL;
-       char *data = NULL;
-       int ret, s;
-       bool r;
-
-       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-               return -EIO;
-
-       if (f->type != LXC_TYPE_CGFILE) {
-               lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
-               return -EIO;
-       }
-
-       if (offset)
-               return 0;
-
-       if (!f->controller)
-               return -EINVAL;
-
-       if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
-               return -EINVAL;
-       }
-       free_key(k);
-
-
-       if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
-               ret = -EACCES;
-               goto out;
-       }
-
-       if (strcmp(f->file, "tasks") == 0 ||
-                       strcmp(f->file, "/tasks") == 0 ||
-                       strcmp(f->file, "/cgroup.procs") == 0 ||
-                       strcmp(f->file, "cgroup.procs") == 0)
-               // special case - we have to translate the pids
-               r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
-       else
-               r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data);
-
-       if (!r) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       if (!data) {
-               ret = 0;
-               goto out;
-       }
-       s = strlen(data);
-       if (s > size)
-               s = size;
-       memcpy(buf, data, s);
-       if (s > 0 && s < size && data[s-1] != '\n')
-               buf[s++] = '\n';
-
-       ret = s;
-
-out:
-       free(data);
-       return ret;
-}
-
-static int pid_from_ns(int sock, pid_t tpid)
-{
-       pid_t vpid;
-       struct ucred cred;
-       char v;
-       int ret;
-
-       cred.uid = 0;
-       cred.gid = 0;
-       while (1) {
-               if (!wait_for_sock(sock, 2)) {
-                       lxcfs_error("%s\n", "Timeout reading from parent.");
-                       return 1;
-               }
-               if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
-                       lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
-                       return 1;
-               }
-               if (vpid == -1) // done
-                       break;
-               v = '0';
-               cred.pid = vpid;
-               if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
-                       v = '1';
-                       cred.pid = getpid();
-                       if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
-                               return 1;
-               }
-       }
-       return 0;
-}
-
-static void pid_from_ns_wrapper(int sock, pid_t tpid)
-{
-       int newnsfd = -1, ret, cpipe[2];
-       char fnam[100];
-       pid_t cpid;
-       char v;
-
-       ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
-       if (ret < 0 || ret >= sizeof(fnam))
-               _exit(1);
-       newnsfd = open(fnam, O_RDONLY);
-       if (newnsfd < 0)
-               _exit(1);
-       if (setns(newnsfd, 0) < 0)
-               _exit(1);
-       close(newnsfd);
-
-       if (pipe(cpipe) < 0)
-               _exit(1);
-
-       struct pid_ns_clone_args args = {
-               .cpipe = cpipe,
-               .sock = sock,
-               .tpid = tpid,
-               .wrapped = &pid_from_ns
-       };
-       size_t stack_size = sysconf(_SC_PAGESIZE);
-       void *stack = alloca(stack_size);
-
-       cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
-       if (cpid < 0)
-               _exit(1);
-
-       // give the child 1 second to be done forking and
-       // write its ack
-       if (!wait_for_sock(cpipe[0], 1))
-               _exit(1);
-       ret = read(cpipe[0], &v, 1);
-       if (ret != sizeof(char) || v != '1')
-               _exit(1);
-
-       if (!wait_for_pid(cpid))
-               _exit(1);
-       _exit(0);
-}
-
-/*
- * Given host @uid, return the uid to which it maps in
- * @pid's user namespace, or -1 if none.
- */
-bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
-{
-       FILE *f;
-       char line[400];
-
-       sprintf(line, "/proc/%d/uid_map", pid);
-       if ((f = fopen(line, "r")) == NULL) {
-               return false;
-       }
-
-       *answer = convert_id_to_ns(f, uid);
-       fclose(f);
-
-       if (*answer == -1)
-               return false;
-       return true;
-}
-
-/*
- * get_pid_creds: get the real uid and gid of @pid from
- * /proc/$$/status
- * (XXX should we use euid here?)
- */
-void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
-{
-       char line[400];
-       uid_t u;
-       gid_t g;
-       FILE *f;
-
-       *uid = -1;
-       *gid = -1;
-       sprintf(line, "/proc/%d/status", pid);
-       if ((f = fopen(line, "r")) == NULL) {
-               lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
-               return;
-       }
-       while (fgets(line, 400, f)) {
-               if (strncmp(line, "Uid:", 4) == 0) {
-                       if (sscanf(line+4, "%u", &u) != 1) {
-                               lxcfs_error("bad uid line for pid %u\n", pid);
-                               fclose(f);
-                               return;
-                       }
-                       *uid = u;
-               } else if (strncmp(line, "Gid:", 4) == 0) {
-                       if (sscanf(line+4, "%u", &g) != 1) {
-                               lxcfs_error("bad gid line for pid %u\n", pid);
-                               fclose(f);
-                               return;
-                       }
-                       *gid = g;
-               }
-       }
-       fclose(f);
-}
-
-/*
- * May the requestor @r move victim @v to a new cgroup?
- * This is allowed if
- *   . they are the same task
- *   . they are ownedy by the same uid
- *   . @r is root on the host, or
- *   . @v's uid is mapped into @r's where @r is root.
- */
-bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
-{
-       uid_t v_uid, tmpuid;
-       gid_t v_gid;
-
-       if (r == v)
-               return true;
-       if (r_uid == 0)
-               return true;
-       get_pid_creds(v, &v_uid, &v_gid);
-       if (r_uid == v_uid)
-               return true;
-       if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
-                       && hostuid_to_ns(v_uid, r, &tmpuid))
-               return true;
-       return false;
-}
-
-static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
-               const char *file, const char *buf)
-{
-       int sock[2] = {-1, -1};
-       pid_t qpid, cpid = -1;
-       FILE *pids_file = NULL;
-       bool answer = false, fail = false;
-
-       pids_file = open_pids_file(contrl, cg);
-       if (!pids_file)
+       ev.events = POLLIN_SET;
+       ev.data.fd = sock;
+       if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
+               lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
+               close(epfd);
                 return false;
-
-       /*
-        * write the pids to a socket, have helper in writer's pidns
-        * call movepid for us
-        */
-       if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
-               perror("socketpair");
-               goto out;
-       }
-
-       cpid = fork();
-       if (cpid == -1)
-               goto out;
-
-       if (!cpid) { // child
-               fclose(pids_file);
-               pid_from_ns_wrapper(sock[1], tpid);
-       }
-
-       const char *ptr = buf;
-       while (sscanf(ptr, "%d", &qpid) == 1) {
-               struct ucred cred;
-               char v;
-
-               if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
-                       lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
-                       goto out;
-               }
-
-               if (recv_creds(sock[0], &cred, &v)) {
-                       if (v == '0') {
-                               if (!may_move_pid(tpid, tuid, cred.pid)) {
-                                       fail = true;
-                                       break;
-                               }
-                               if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
-                                       fail = true;
-                       }
-               }
-
-               ptr = strchr(ptr, '\n');
-               if (!ptr)
-                       break;
-               ptr++;
-       }
-
-       /* All good, write the value */
-       qpid = -1;
-       if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
-               lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
-
-       if (!fail)
-               answer = true;
-
-out:
-       if (cpid != -1)
-               wait_for_pid(cpid);
-       if (sock[0] != -1) {
-               close(sock[0]);
-               close(sock[1]);
         }
-       if (pids_file) {
-               if (fclose(pids_file) != 0)
-                       answer = false;
-       }
-       return answer;
-}
-
-int cg_write(const char *path, const char *buf, size_t size, off_t offset,
-            struct fuse_file_info *fi)
-{
-       struct fuse_context *fc = fuse_get_context();
-       char *localbuf = NULL;
-       struct cgfs_files *k = NULL;
-       struct file_info *f = (struct file_info *)fi->fh;
-       bool r;
  
-       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-               return -EIO;
-
-       if (f->type != LXC_TYPE_CGFILE) {
-               lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
-               return -EIO;
+again:
+       if ((now = time(NULL)) < 0) {
+               close(epfd);
+               return false;
         }
  
-       if (offset)
-               return 0;
-
-       localbuf = alloca(size+1);
-       localbuf[size] = '\0';
-       memcpy(localbuf, buf, size);
-
-       if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
-               size = -EINVAL;
-               goto out;
+       deltatime = (starttime + timeout) - now;
+       if (deltatime < 0) { // timeout
+               errno = 0;
+               close(epfd);
+               return false;
         }
+       ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
+       if (ret < 0 && errno == EINTR)
+               goto again;
+       saved_errno = errno;
+       close(epfd);
  
-       if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
-               size = -EACCES;
-               goto out;
+       if (ret <= 0) {
+               errno = saved_errno;
+               return false;
         }
-
-       if (strcmp(f->file, "tasks") == 0 ||
-                       strcmp(f->file, "/tasks") == 0 ||
-                       strcmp(f->file, "/cgroup.procs") == 0 ||
-                       strcmp(f->file, "cgroup.procs") == 0)
-               // special case - we have to translate the pids
-               r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
-       else
-               r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
-
-       if (!r)
-               size = -EINVAL;
-
-out:
-       free_key(k);
-       return size;
+       return true;
  }
  
-int cg_chown(const char *path, uid_t uid, gid_t gid)
+static int msgrecv(int sockfd, void *buf, size_t len)
  {
-       struct fuse_context *fc = fuse_get_context();
-       char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
-       struct cgfs_files *k = NULL;
-       const char *cgroup;
-       int ret;
-
-       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-               return -EIO;
-
-       if (strcmp(path, "/cgroup") == 0)
-               return -EPERM;
-
-       controller = pick_controller_from_path(fc, path);
-       if (!controller)
-               return errno == ENOENT ? -EPERM : -errno;
-
-       cgroup = find_cgroup_in_path(path);
-       if (!cgroup)
-               /* this is just /cgroup/controller */
-               return -EPERM;
-
-       get_cgdir_and_path(cgroup, &cgdir, &last);
-
-       if (!last) {
-               path1 = "/";
-               path2 = cgdir;
-       } else {
-               path1 = cgdir;
-               path2 = last;
-       }
-
-       if (is_child_cgroup(controller, path1, path2)) {
-               // get uid, gid, from '/tasks' file and make up a mode
-               // That is a hack, until cgmanager gains a GetCgroupPerms fn.
-               k = cgfs_get_key(controller, cgroup, "tasks");
-
-       } else
-               k = cgfs_get_key(controller, path1, path2);
-
-       if (!k) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       /*
-        * This being a fuse request, the uid and gid must be valid
-        * in the caller's namespace.  So we can just check to make
-        * sure that the caller is root in his uid, and privileged
-        * over the file's current owner.
-        */
-       if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
-               ret = -EACCES;
-               goto out;
-       }
-
-       ret = cgfs_chown_file(controller, cgroup, uid, gid);
-
-out:
-       free_key(k);
-       free(cgdir);
-
-       return ret;
+       if (!wait_for_sock(sockfd, 2))
+               return -1;
+       return recv(sockfd, buf, len, MSG_DONTWAIT);
  }
  
-int cg_chmod(const char *path, mode_t mode)
+static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
  {
-       struct fuse_context *fc = fuse_get_context();
-       char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
-       struct cgfs_files *k = NULL;
-       const char *cgroup;
-       int ret;
-
-       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-               return -EIO;
-
-       if (strcmp(path, "/cgroup") == 0)
-               return -EPERM;
-
-       controller = pick_controller_from_path(fc, path);
-       if (!controller)
-               return errno == ENOENT ? -EPERM : -errno;
-
-       cgroup = find_cgroup_in_path(path);
-       if (!cgroup)
-               /* this is just /cgroup/controller */
-               return -EPERM;
-
-       get_cgdir_and_path(cgroup, &cgdir, &last);
+       struct msghdr msg = { 0 };
+       struct iovec iov;
+       struct cmsghdr *cmsg;
+       char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
+       char buf[1];
+       buf[0] = 'p';
  
-       if (!last) {
-               path1 = "/";
-               path2 = cgdir;
-       } else {
-               path1 = cgdir;
-               path2 = last;
+       if (pingfirst) {
+               if (msgrecv(sock, buf, 1) != 1) {
+                       lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
+                       return SEND_CREDS_FAIL;
+               }
         }
  
-       if (is_child_cgroup(controller, path1, path2)) {
-               // get uid, gid, from '/tasks' file and make up a mode
-               // That is a hack, until cgmanager gains a GetCgroupPerms fn.
-               k = cgfs_get_key(controller, cgroup, "tasks");
+       msg.msg_control = cmsgbuf;
+       msg.msg_controllen = sizeof(cmsgbuf);
  
-       } else
-               k = cgfs_get_key(controller, path1, path2);
+       cmsg = CMSG_FIRSTHDR(&msg);
+       cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+       cmsg->cmsg_level = SOL_SOCKET;
+       cmsg->cmsg_type = SCM_CREDENTIALS;
+       memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
  
-       if (!k) {
-               ret = -EINVAL;
-               goto out;
-       }
+       msg.msg_name = NULL;
+       msg.msg_namelen = 0;
  
-       /*
-        * This being a fuse request, the uid and gid must be valid
-        * in the caller's namespace.  So we can just check to make
-        * sure that the caller is root in his uid, and privileged
-        * over the file's current owner.
-        */
-       if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
-               ret = -EPERM;
-               goto out;
-       }
+       buf[0] = v;
+       iov.iov_base = buf;
+       iov.iov_len = sizeof(buf);
+       msg.msg_iov = &iov;
+       msg.msg_iovlen = 1;
  
-       if (!cgfs_chmod_file(controller, cgroup, mode)) {
-               ret = -EINVAL;
-               goto out;
+       if (sendmsg(sock, &msg, 0) < 0) {
+               lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
+               if (errno == 3)
+                       return SEND_CREDS_NOTSK;
+               return SEND_CREDS_FAIL;
         }
  
-       ret = 0;
-out:
-       free_key(k);
-       free(cgdir);
-       return ret;
+       return SEND_CREDS_OK;
  }
  
-int cg_mkdir(const char *path, mode_t mode)
+static bool recv_creds(int sock, struct ucred *cred, char *v)
  {
-       struct fuse_context *fc = fuse_get_context();
-       char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
-       const char *cgroup;
+       struct msghdr msg = { 0 };
+       struct iovec iov;
+       struct cmsghdr *cmsg;
+       char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
+       char buf[1];
         int ret;
+       int optval = 1;
  
-       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-               return -EIO;
-
-       controller = pick_controller_from_path(fc, path);
-       if (!controller)
-               return errno == ENOENT ? -EPERM : -errno;
-
-       cgroup = find_cgroup_in_path(path);
-       if (!cgroup)
-               return -errno;
-
-       get_cgdir_and_path(cgroup, &cgdir, &last);
-       if (!last)
-               path1 = "/";
-       else
-               path1 = cgdir;
+       *v = '1';
  
-       pid_t initpid = lookup_initpid_in_store(fc->pid);
-       if (initpid <= 1 || is_shared_pidns(initpid))
-               initpid = fc->pid;
-       if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
-               if (!next)
-                       ret = -EINVAL;
-               else if (last && strcmp(next, last) == 0)
-                       ret = -EEXIST;
-               else
-                       ret = -EPERM;
-               goto out;
-       }
+       cred->pid = -1;
+       cred->uid = -1;
+       cred->gid = -1;
  
-       if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
-               ret = -EACCES;
-               goto out;
+       if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
+               lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
+               return false;
         }
-       if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
-               ret = -EACCES;
-               goto out;
+       buf[0] = '1';
+       if (write(sock, buf, 1) != 1) {
+               lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
+               return false;
         }
  
-       ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
-
-out:
-       free(cgdir);
-       free(next);
-       return ret;
-}
-
-int cg_rmdir(const char *path)
-{
-       struct fuse_context *fc = fuse_get_context();
-       char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
-       const char *cgroup;
-       int ret;
-
-       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-               return -EIO;
-
-       controller = pick_controller_from_path(fc, path);
-       if (!controller) /* Someone's trying to delete "/cgroup". */
-               return -EPERM;
+       msg.msg_name = NULL;
+       msg.msg_namelen = 0;
+       msg.msg_control = cmsgbuf;
+       msg.msg_controllen = sizeof(cmsgbuf);
  
-       cgroup = find_cgroup_in_path(path);
-       if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
-               return -EPERM;
+       iov.iov_base = buf;
+       iov.iov_len = sizeof(buf);
+       msg.msg_iov = &iov;
+       msg.msg_iovlen = 1;
  
-       get_cgdir_and_path(cgroup, &cgdir, &last);
-       if (!last) {
-               /* Someone's trying to delete a cgroup on the same level as the
-                * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
-                * rmdir "/cgroup/blkio/init.slice".
-                */
-               ret = -EPERM;
-               goto out;
+       if (!wait_for_sock(sock, 2)) {
+               lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
+               return false;
         }
-
-       pid_t initpid = lookup_initpid_in_store(fc->pid);
-       if (initpid <= 1 || is_shared_pidns(initpid))
-               initpid = fc->pid;
-       if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
-               if (!last || (next && (strcmp(next, last) == 0)))
-                       ret = -EBUSY;
-               else
-                       ret = -ENOENT;
-               goto out;
+       ret = recvmsg(sock, &msg, MSG_DONTWAIT);
+       if (ret < 0) {
+               lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
+               return false;
         }
  
-       if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
-               ret = -EACCES;
-               goto out;
-       }
-       if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
-               ret = -EACCES;
-               goto out;
-       }
+       cmsg = CMSG_FIRSTHDR(&msg);
  
-       if (!cgfs_remove(controller, cgroup)) {
-               ret = -EINVAL;
-               goto out;
+       if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
+                       cmsg->cmsg_level == SOL_SOCKET &&
+                       cmsg->cmsg_type == SCM_CREDENTIALS) {
+               memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
         }
+       *v = buf[0];
  
-       ret = 0;
-
-out:
-       free(cgdir);
-       free(next);
-       return ret;
+       return true;
  }
  
+struct pid_ns_clone_args {
+       int *cpipe;
+       int sock;
+       pid_t tpid;
+       int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
+};
+
  static bool startswith(const char *line, const char *pref)
  {
         if (strncmp(line, pref, strlen(pref)) == 0)
diff --git a/bindings.h b/bindings.h

index e3c0c834e939d527f0b9a4c83116a718b190b6fd..7f928d6e2adfd10a332664a5dcfa5fab18713d50 100644 (file)
--- a/bindings.h
+++ b/bindings.h
@@ -2,6 +2,7 @@
  #define __LXCFS_BINDINGS_H
  
  #include "macro.h"
+#include "cgroup_fuse.h"
  #include "sysfs_fuse.h"
  
  /* directory under which we mount the controllers - /run/lxcfs/controllers */
@@ -42,23 +43,6 @@ struct lxcfs_opts {
         bool swap_off;
  };
  
-extern int cg_write(const char *path, const char *buf, size_t size, off_t offset,
-            struct fuse_file_info *fi);
-extern int cg_mkdir(const char *path, mode_t mode);
-extern int cg_chown(const char *path, uid_t uid, gid_t gid);
-extern int cg_rmdir(const char *path);
-extern int cg_chmod(const char *path, mode_t mode);
-extern int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
-               struct fuse_file_info *fi);
-extern int cg_releasedir(const char *path, struct fuse_file_info *fi);
-extern int cg_release(const char *path, struct fuse_file_info *fi);
-extern int cg_read(const char *path, char *buf, size_t size, off_t offset,
-               struct fuse_file_info *fi);
-extern int cg_opendir(const char *path, struct fuse_file_info *fi);
-extern int cg_getattr(const char *path, struct stat *sb);
-extern int cg_open(const char *path, struct fuse_file_info *fi);
-extern int cg_access(const char *path, int mode);
-
  extern int proc_getattr(const char *path, struct stat *sb);
  extern int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
                 struct fuse_file_info *fi);
diff --git a/cgroup_fuse.c b/cgroup_fuse.c

new file mode 100644 (file)

index 0000000..e7833a2
--- /dev/null
+++ b/cgroup_fuse.c
@@ -0,0 +1,2302 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#define FUSE_USE_VERSION 26
+
+#define __STDC_FORMAT_MACROS
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <fuse.h>
+#include <inttypes.h>
+#include <libgen.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <wait.h>
+#include <linux/magic.h>
+#include <linux/sched.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+#include <sys/sysinfo.h>
+#include <sys/vfs.h>
+
+#include "bindings.h"
+#include "config.h"
+#include "cgroups/cgroup.h"
+#include "cgroups/cgroup_utils.h"
+#include "memory_utils.h"
+#include "utils.h"
+
+/*
+ * Name, ownership and mode of one cgroup file or directory, as filled in
+ * by cgfs_get_key() from an fstatat() result.
+ */
+struct cgfs_files {
+       /* Copy of the entry name, made with must_copy_string(). */
+       char *name;
+       /* Owner ids, taken from st_uid and st_gid. */
+       uint32_t uid, gid;
+       /* Taken from st_mode. */
+       uint32_t mode;
+};
+
+/*
+ * Argument bundle for a helper that runs a pid-translation callback.
+ * NOTE(review): the field roles below are inferred from the
+ * pid_from_ns_wrapper() code that this patch removes from bindings.c;
+ * confirm against pid_ns_clone_wrapper().
+ */
+struct pid_ns_clone_args {
+       /* Pipe fd pair; presumably the child writes its ack here - confirm. */
+       int *cpipe;
+       /* Presumably the socket passed on to @wrapped - confirm. */
+       int sock;
+       /* Presumably the pid passed on to @wrapped - confirm. */
+       pid_t tpid;
+       /* pid_from_ns or pid_to_ns. */
+       int (*wrapped) (int, pid_t);
+};
+
+/*
+ * Map a path of the form /cgroup/<controller>[/...] to the matching
+ * controller string, e.g. /cgroup/freezer/a/b yields "freezer".
+ *
+ * The result is the __controllers pointer of the matching hierarchy in
+ * cgroup_ops and must NOT be freed by the caller. @fc is not used.
+ *
+ * Returns NULL with errno set to:
+ *   EACCES - @path is shorter than 9 characters
+ *   EINVAL - the character at offset 7 of @path is not '/'
+ *   ENOMEM - the temporary copy of the name could not be made
+ *   ENOENT - no hierarchy has a matching __controllers string
+ */
+static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
+{
+       char *name, *sep;
+       struct hierarchy **it;
+
+       if (strlen(path) < 9) {
+               errno = EACCES;
+               return NULL;
+       }
+
+       if (path[7] != '/') {
+               errno = EINVAL;
+               return NULL;
+       }
+
+       /* Stack copy of everything after "/cgroup/". */
+       name = strdupa(path + 8);
+       if (!name) {
+               errno = ENOMEM;
+               return NULL;
+       }
+
+       /* Keep only the first path component. */
+       sep = strchr(name, '/');
+       if (sep)
+               *sep = '\0';
+
+       it = cgroup_ops->hierarchies;
+       while (it && *it) {
+               if ((*it)->__controllers && strcmp((*it)->__controllers, name) == 0)
+                       return (*it)->__controllers;
+               it++;
+       }
+
+       errno = ENOENT;
+       return NULL;
+}
+
+/*
+ * Locate the cgroup portion of /cgroup/<controller>/<the/cgroup/path>.
+ *
+ * Returns a pointer into @path just past the first '/' found at or after
+ * offset 8, with errno cleared to 0. The returned tail may also contain
+ * file (key) names. Returns NULL with errno set to EACCES when @path is
+ * shorter than 9 characters, or EINVAL when no such '/' exists.
+ */
+static const char *find_cgroup_in_path(const char *path)
+{
+       const char *sep;
+
+       if (strlen(path) < 9) {
+               errno = EACCES;
+               return NULL;
+       }
+
+       sep = strchr(path + 8, '/');
+       if (sep == NULL) {
+               errno = EINVAL;
+               return NULL;
+       }
+
+       errno = 0;
+       return sep + 1;
+}
+
+/*
+ * Split @cg at its last '/'.
+ *
+ * *dir receives a newly allocated copy of @cg, cut off at the last '/';
+ * the caller must free it. *last is set to the position of that last '/'
+ * inside @cg itself (so it must not be freed), or to NULL when @cg has no
+ * '/', in which case *dir is a full copy of @cg.
+ */
+static void get_cgdir_and_path(const char *cg, char **dir, char **last)
+{
+       char *copy, *cut;
+
+       /* Retry until the allocation succeeds. */
+       while ((copy = strdup(cg)) == NULL)
+               ;
+       *dir = copy;
+
+       *last = strrchr(cg, '/');
+       if (*last == NULL)
+               return;
+
+       cut = strrchr(copy, '/');
+       *cut = '\0';
+}
+
+/*
+ * Report whether @f names a directory inside @cgroup of @controller.
+ *
+ * The lookup is done with fstatat() relative to the fd returned by
+ * get_cgroup_fd(). Returns false when that fd is unavailable, when the
+ * path cannot be formatted, when fstatat() fails, or when the entry is
+ * not a directory.
+ */
+static bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
+{
+       struct stat st;
+       char *relpath;
+       size_t buflen;
+       int fd, n;
+
+       fd = get_cgroup_fd(controller);
+       if (fd < 0)
+               return false;
+
+       /* The *at() functions need a relative path:
+        * . + /cgroup + / + f + \0
+        */
+       buflen = strlen(cgroup) + strlen(f) + 3;
+       relpath = alloca(buflen);
+       n = snprintf(relpath, buflen, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
+       if (n < 0 || (size_t)n >= buflen)
+               return false;
+
+       if (fstatat(fd, relpath, &st, 0) < 0)
+               return false;
+
+       return S_ISDIR(st.st_mode) ? true : false;
+}
+
+/*
+ * Decide whether the task @pid may see the cgroup directory @cg of
+ * controller @contrl.
+ *
+ * A task in /a/b/c may see /a (a parent of its own cgroup) and anything
+ * below /a/b/c, but not /b or /a/c. "/" and "./" are always visible, and
+ * a task in the root cgroup sees everything. Returns false when the
+ * task's cgroup cannot be determined.
+ */
+static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
+{
+       bool visible;
+       char *own, *task_cg;
+       size_t target_len, task_len;
+
+       if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
+               return true;
+
+       own = get_pid_cgroup(pid, contrl);
+       if (!own)
+               return false;
+       prune_init_slice(own);
+
+       /* Skip the first character so both sides compare without it. */
+       task_cg = own + 1;
+       target_len = strlen(cg);
+       task_len = strlen(task_cg);
+
+       if (task_len == 0) {
+               /* The task is in the root cgroup and can see everything.
+                * The prefix comparisons below look for a '/' separator,
+                * which no longer exists once the first character has
+                * been skipped.
+                */
+               visible = true;
+       } else if (strcmp(cg, task_cg) == 0) {
+               visible = true;
+       } else if (target_len < task_len) {
+               /* @cg must be a parent directory of the task's cgroup. */
+               visible = strncmp(task_cg, cg, target_len) == 0 &&
+                         task_cg[target_len] == '/';
+       } else if (target_len > task_len) {
+               /* @cg must be a child directory of the task's cgroup. */
+               visible = strncmp(task_cg, cg, task_len) == 0 &&
+                         cg[task_len] == '/';
+       } else {
+               visible = false;
+       }
+
+       free(own);
+       return visible;
+}
+
+/*
+ * Return the path component of @taskcg that follows @querycg.
+ *
+ * With taskcg = /a/b/c/d/e and querycg = /a/b/c the result is "d".
+ * When @querycg is "/" or "./" the result is the first component of
+ * @taskcg after its first character.
+ *
+ * The result is newly allocated and must be freed by the caller. NULL is
+ * returned when @taskcg is not longer than @querycg or when the
+ * allocation fails.
+ */
+static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
+{
+       size_t skip;
+       char *next, *slash;
+
+       if (strlen(taskcg) <= strlen(querycg)) {
+               lxcfs_error("%s\n", "I was fed bad input.");
+               return NULL;
+       }
+
+       if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
+               skip = 1;
+       else
+               skip = strlen(querycg) + 1;
+
+       next = strdup(taskcg + skip);
+       if (!next)
+               return NULL;
+
+       /* Keep only the first component. */
+       slash = strchr(next, '/');
+       if (slash)
+               *slash = '\0';
+
+       return next;
+}
+
+/*
+ * Check whether the cgroup of task @pid (for controller @contrl) is a
+ * string prefix of @cg.
+ *
+ * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
+ * If pid is in /a, he may act on /a/b, but not on /b.
+ *
+ * When the answer is false and @nextcg is not NULL, *nextcg is set to the
+ * result of get_next_cgroup_dir() for the task's cgroup and @cg: a string
+ * the caller must free, or NULL. *nextcg is left untouched when the
+ * answer is true or when the task's cgroup cannot be determined.
+ */
+static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
+{
+       char *task_path, *cmp;
+       bool inside;
+
+       task_path = get_pid_cgroup(pid, contrl);
+       if (!task_path)
+               return false;
+       prune_init_slice(task_path);
+
+       /*
+        * Callers pass '/' or './' (openat()) for the root cgroup, otherwise
+        * a cgroup without a leading '/'. In the latter case the first
+        * character of the task's cgroup is skipped before comparing.
+        *
+        * The original line here was:
+        *      linecmp = *cg == '/' ? c2 : c2+1;
+        * TODO: it is unclear why the pointer is advanced when *cg != '/'.
+        */
+       if (*cg == '/' || strncmp(cg, "./", 2) == 0)
+               cmp = task_path;
+       else
+               cmp = task_path + 1;
+
+       inside = strncmp(cmp, cg, strlen(cmp)) == 0;
+       if (!inside && nextcg)
+               *nextcg = get_next_cgroup_dir(cmp, cg);
+
+       free(task_path);
+       return inside;
+}
+
+/*
+ * Stat @file inside @cgroup of @controller and return its name, owner
+ * and mode as a newly allocated struct cgfs_files.
+ *
+ * @file may be NULL, in which case @cgroup itself is examined. One
+ * leading '/' in @file is ignored; any other '/' in @file is rejected.
+ *
+ * The name stored in the result is @file when given; otherwise it is the
+ * tail of @cgroup starting at its last '/', or all of @cgroup when it
+ * contains no '/'.
+ *
+ * Returns NULL when the controller fd is unavailable, when @file contains
+ * a '/', when the path cannot be formatted, or when fstatat() fails. The
+ * callers visible in this patch release the result with free_key().
+ */
+static struct cgfs_files *cgfs_get_key(const char *controller,
+                                      const char *cgroup, const char *file)
+{
+       int ret, cfd;
+       size_t len;
+       char *fnam;
+       struct stat sb;
+       struct cgfs_files *newkey;
+
+       cfd = get_cgroup_fd(controller);
+       if (cfd < 0)
+               return NULL;
+
+       if (file && *file == '/')
+               file++;
+
+       if (file && strchr(file, '/'))
+               return NULL;
+
+       /* Make sure we pass a relative path to *at() family of functions.
+        * . + /cgroup + / + file + \0
+        */
+       len = strlen(cgroup) + 3;
+       if (file)
+               len += strlen(file) + 1;
+       fnam = alloca(len);
+       ret = snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
+                      file ? "/" : "", file ? file : "");
+       /* Never stat a truncated path; same check as is_child_cgroup(). */
+       if (ret < 0 || (size_t)ret >= len)
+               return NULL;
+
+       ret = fstatat(cfd, fnam, &sb, 0);
+       if (ret < 0)
+               return NULL;
+
+       /* Retry until the allocation succeeds. */
+       do {
+               newkey = malloc(sizeof(*newkey));
+       } while (!newkey);
+
+       if (file)
+               newkey->name = must_copy_string(file);
+       else if (strrchr(cgroup, '/'))
+               newkey->name = must_copy_string(strrchr(cgroup, '/'));
+       else
+               newkey->name = must_copy_string(cgroup);
+       newkey->uid = sb.st_uid;
+       newkey->gid = sb.st_gid;
+       newkey->mode = sb.st_mode;
+
+       return newkey;
+}
+
+/*
+ * Translate @in_id through the id map readable from @idfile, an open
+ * FILE * for /proc/pid/{u,g}id_map.
+ *
+ * Each line is parsed as three unsigned numbers: the base id in the map
+ * file's namespace, the base id in the caller's namespace, and the number
+ * of ids in the range. @idfile is rewound first; lines that do not parse
+ * are skipped.
+ *
+ * Returns the mapped id, or -1 (as unsigned int) when no range contains
+ * @in_id or when a range wraps around.
+ */
+static unsigned int convert_id_to_ns(FILE *idfile, unsigned int in_id)
+{
+       char line[400];
+       unsigned int ns_base, host_base, range;
+
+       fseek(idfile, 0L, SEEK_SET);
+       while (fgets(line, sizeof(line), idfile)) {
+               if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
+                       continue;
+
+               /*
+                * A wrapped range is unexpected as this is a procfile,
+                * so just bail.
+                */
+               if (host_base + range < host_base || ns_base + range < ns_base) {
+                       lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
+                               ns_base, host_base, range, line);
+                       return -1;
+               }
+
+               /* Not in this range: try the next line. */
+               if (in_id < host_base || host_base + range <= in_id)
+                       continue;
+
+               /*
+                * host_base <= in_id < host_base + range and neither sum
+                * wraps, so ns_base + (in_id - host_base) cannot wrap
+                * either.
+                */
+               return (in_id - host_base) + ns_base;
+       }
+
+       /* No range matched. */
+       return -1;
+}
+
+/*
+ * for is_privileged_over,
+ * specify whether we require the calling uid to be root in his
+ * namespace
+ */
+#define NS_ROOT_REQD true
+#define NS_ROOT_OPT false
+
+#define PROCLEN 100
+
+/*
+ * Decide whether @uid, as seen through the uid_map of process @pid, is
+ * privileged over @victim.
+ *
+ * req_ns_root: NS_ROOT_REQD forces the caller to be root in its namespace;
+ *              with NS_ROOT_OPT an identical uid is already sufficient.
+ *
+ * Returns true when privileged, false otherwise (including when the uid_map
+ * cannot be opened).
+ */
+static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
+{
+       char map_path[PROCLEN];
+       bool privileged = false;
+       FILE *map;
+       int len;
+
+       if (uid == -1 || victim == -1)
+               return false;
+
+       /* Without the root requirement, owning the file is enough. */
+       if (uid == victim && !req_ns_root)
+               return true;
+
+       len = snprintf(map_path, PROCLEN, "/proc/%d/uid_map", pid);
+       if (len < 0 || len >= PROCLEN)
+               return false;
+
+       map = fopen(map_path, "r");
+       if (!map)
+               return false;
+
+       /*
+        * The caller has to map to uid 0 inside its namespace, and the victim
+        * has to be mapped into that namespace at all. The second lookup only
+        * happens when the first one succeeded.
+        */
+       if (convert_id_to_ns(map, uid) == 0 &&
+           convert_id_to_ns(map, victim) != -1)
+               privileged = true;
+
+       fclose(map);
+       return privileged;
+}
+
+/*
+ * Check whether the "other" permission bits of @fmode grant the access
+ * requested by the open(2)-style access mode in @req_mode. Callers shift
+ * the owner or group bits down into the "other" position beforehand.
+ *
+ * Returns false for an access mode that is none of O_RDONLY, O_WRONLY or
+ * O_RDWR.
+ */
+static bool perms_include(int fmode, mode_t req_mode)
+{
+       mode_t access = req_mode & O_ACCMODE;
+       mode_t needed;
+
+       if (access == O_RDONLY)
+               needed = S_IROTH;
+       else if (access == O_WRONLY)
+               needed = S_IWOTH;
+       else if (access == O_RDWR)
+               needed = S_IROTH | S_IWOTH;
+       else
+               return false;
+
+       return (fmode & needed) == needed;
+}
+
+/* Release a cgfs_files key together with its name; NULL is accepted. */
+static void free_key(struct cgfs_files *k)
+{
+       if (k) {
+               free_disarm(k->name);
+               free_disarm(k);
+       }
+}
+
+/*
+ * Check whether a fuse context may access a cgroup directory or file.
+ *
+ * file: the cgroup file to check below cg, or NULL to check cg itself.
+ * mode: open(2)-style access mode that is being requested.
+ *
+ * The owner, group and other permission bits of the key returned by
+ * cgfs_get_key() are tried in that order; the first class that both applies
+ * to the caller and grants the access wins, and the "other" bits are the
+ * fallback.
+ */
+static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
+{
+       bool allowed;
+       struct cgfs_files *key = cgfs_get_key(contrl, cg, file);
+
+       if (!key)
+               return false;
+
+       if (is_privileged_over(fc->pid, fc->uid, key->uid, NS_ROOT_OPT) &&
+           perms_include(key->mode >> 6, mode))
+               allowed = true;
+       else if (fc->gid == key->gid && perms_include(key->mode >> 3, mode))
+               allowed = true;
+       else
+               allowed = perms_include(key->mode, mode);
+
+       free_key(key);
+       return allowed;
+}
+
+/*
+ * Fill @sb with the attributes lxcfs reports for @path below "/cgroup".
+ *
+ * "/cgroup" itself and a bare "/cgroup/<controller>" are reported as 0755
+ * directories. Anything deeper is either a child cgroup (reported as a
+ * directory) or a key found by cgfs_get_key() (reported as a regular file).
+ *
+ * Returns 0 on success or a negative errno value: -EIO without a fuse
+ * context, without cgroup_ops or on a pure unified layout; -EINVAL when the
+ * clock cannot be read; -errno when no controller can be picked from the
+ * path; -ENOENT when the path does not resolve or is hidden from the
+ * caller; -EACCES when fc_may_access() refuses.
+ */
+int cg_getattr(const char *path, struct stat *sb)
+{
+       struct timespec now;
+       struct fuse_context *fc = fuse_get_context();
+       char * cgdir = NULL;
+       char *last = NULL, *path1, *path2;
+       struct cgfs_files *k = NULL;
+       const char *cgroup;
+       const char *controller = NULL;
+       int ret = -ENOENT;
+
+
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
+       memset(sb, 0, sizeof(struct stat));
+
+       if (clock_gettime(CLOCK_REALTIME, &now) < 0)
+               return -EINVAL;
+
+       /* Defaults: owned by root, all timestamps "now", size 0. */
+       sb->st_uid = sb->st_gid = 0;
+       sb->st_atim = sb->st_mtim = sb->st_ctim = now;
+       sb->st_size = 0;
+
+       if (strcmp(path, "/cgroup") == 0) {
+               sb->st_mode = S_IFDIR | 00755;
+               sb->st_nlink = 2;
+               return 0;
+       }
+
+       controller = pick_controller_from_path(fc, path);
+       if (!controller)
+               return -errno;
+       cgroup = find_cgroup_in_path(path);
+       if (!cgroup) {
+               /* this is just /cgroup/controller, return it as a dir */
+               sb->st_mode = S_IFDIR | 00755;
+               sb->st_nlink = 2;
+               return 0;
+       }
+
+       get_cgdir_and_path(cgroup, &cgdir, &last);
+
+       /* Split into a parent (path1) and the final component (path2). */
+       if (!last) {
+               path1 = "/";
+               path2 = cgdir;
+       } else {
+               path1 = cgdir;
+               path2 = last;
+       }
+
+       /* Fall back to the caller's own pid when no usable init pid is found. */
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 1 || is_shared_pidns(initpid))
+               initpid = fc->pid;
+       /* Check that path2 is either a child cgroup of path1, or listed in
+        * its keys. Then check that caller's cgroup is under path if path2 is
+        * a child cgroup, or under path1 if path2 is a file */
+
+       if (is_child_cgroup(controller, path1, path2)) {
+               if (!caller_may_see_dir(initpid, controller, cgroup)) {
+                       ret = -ENOENT;
+                       goto out;
+               }
+               if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
+                       /* caller_is_in_ancestor() failed: still report the
+                        * directory, but without any write permission bits */
+                       sb->st_mode = S_IFDIR | 00555;
+                       sb->st_nlink = 2;
+                       ret = 0;
+                       goto out;
+               }
+               if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
+                       ret = -EACCES;
+                       goto out;
+               }
+
+               // take uid and gid from the key of the cgroup itself
+               // (file == NULL) and make up a mode; root:root if there is no key
+               sb->st_mode = S_IFDIR | 00755;
+               k = cgfs_get_key(controller, cgroup, NULL);
+               if (!k) {
+                       sb->st_uid = sb->st_gid = 0;
+               } else {
+                       sb->st_uid = k->uid;
+                       sb->st_gid = k->gid;
+               }
+               free_key(k);
+               sb->st_nlink = 2;
+               ret = 0;
+               goto out;
+       }
+
+       /* Not a child cgroup: treat path2 as a file inside path1. */
+       if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
+               sb->st_mode = S_IFREG | k->mode;
+               sb->st_nlink = 1;
+               sb->st_uid = k->uid;
+               sb->st_gid = k->gid;
+               sb->st_size = 0;
+               free_key(k);
+               /* sb is already filled in, but the error hides the file. */
+               if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
+                       ret = -ENOENT;
+                       goto out;
+               }
+               ret = 0;
+       }
+
+out:
+       free(cgdir);
+       return ret;
+}
+
+/*
+ * Chown all the files in the cgroup directory.  We do this when we create a
+ * cgroup on behalf of a user.
+ *
+ * dirname: directory to walk, relative to @fd.
+ * uid/gid: new owner for every entry except "." and "..".
+ * fd:      directory fd that @dirname and the entries are resolved against.
+ *
+ * Best effort: failures are logged and the walk continues; nothing is
+ * reported back to the caller.
+ */
+static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
+{
+       struct dirent *direntp;
+       char path[MAXPATHLEN];
+       size_t len;
+       DIR *d;
+       int fd1, ret;
+
+       len = strlen(dirname);
+       if (len >= MAXPATHLEN) {
+               lxcfs_error("Pathname too long: %s\n", dirname);
+               return;
+       }
+
+       fd1 = openat(fd, dirname, O_DIRECTORY);
+       if (fd1 < 0)
+               return;
+
+       d = fdopendir(fd1);
+       if (!d) {
+               lxcfs_error("Failed to open %s\n", dirname);
+               /* fdopendir() did not take ownership, so fd1 is still ours. */
+               close(fd1);
+               return;
+       }
+
+       while ((direntp = readdir(d))) {
+               if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
+                       continue;
+               ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
+               if (ret < 0 || ret >= MAXPATHLEN) {
+                       lxcfs_error("Pathname too long under %s\n", dirname);
+                       continue;
+               }
+               if (fchownat(fd, path, uid, gid, 0) < 0)
+                       lxcfs_error("Failed to chown file %s to %u:%u\n", path, uid, gid);
+       }
+       /* closedir() also closes fd1. */
+       closedir(d);
+}
+
+/*
+ * Create cgroup @cg below the mount of @controller and, unless the
+ * requested owner is root:root, hand the new directory and the files in it
+ * to @uid:@gid.
+ *
+ * Returns 0 on success, -EINVAL when the controller has no fd, or -errno
+ * from mkdirat()/fchownat().
+ */
+static int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
+{
+       int cfd = get_cgroup_fd(controller);
+       if (cfd < 0)
+               return -EINVAL;
+
+       /* The *at() calls need a relative path: room for "." + cg + \0. */
+       size_t bufsz = strlen(cg) + 2;
+       char *relpath = alloca(bufsz);
+       snprintf(relpath, bufsz, "%s%s", dot_or_empty(cg), cg);
+
+       if (mkdirat(cfd, relpath, 0755) < 0)
+               return -errno;
+
+       if (uid != 0 || gid != 0) {
+               if (fchownat(cfd, relpath, uid, gid, 0) < 0)
+                       return -errno;
+
+               chown_all_cgroup_files(relpath, uid, gid, cfd);
+       }
+
+       return 0;
+}
+
+/*
+ * Create the cgroup named by @path on behalf of the calling fuse context.
+ * The new cgroup is created with the caller's uid and gid; @mode is not
+ * used.
+ *
+ * Returns the result of cgfs_create() on success or a negative errno value:
+ * -EIO without a fuse context, without cgroup_ops or on a pure unified
+ * layout; -EPERM, -EINVAL or -EEXIST when the caller is not placed
+ * appropriately for the parent; -EACCES when the parent is not accessible
+ * read-write.
+ */
+int cg_mkdir(const char *path, mode_t mode)
+{
+       struct fuse_context *fc = fuse_get_context();
+       char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
+       const char *cgroup;
+       int ret;
+
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
+       controller = pick_controller_from_path(fc, path);
+       if (!controller)
+               return errno == ENOENT ? -EPERM : -errno;
+
+       cgroup = find_cgroup_in_path(path);
+       if (!cgroup)
+               return -errno;
+
+       /* path1 is the parent the new cgroup would be created in. */
+       get_cgdir_and_path(cgroup, &cgdir, &last);
+       if (!last)
+               path1 = "/";
+       else
+               path1 = cgdir;
+
+       /* Fall back to the caller's own pid when no usable init pid is found. */
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 1 || is_shared_pidns(initpid))
+               initpid = fc->pid;
+       if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
+               /* next, when set, is compared with the requested name to
+                * tell "already exists" apart from "not permitted" */
+               if (!next)
+                       ret = -EINVAL;
+               else if (last && strcmp(next, last) == 0)
+                       ret = -EEXIST;
+               else
+                       ret = -EPERM;
+               goto out;
+       }
+
+       if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
+               ret = -EACCES;
+               goto out;
+       }
+       if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
+               ret = -EACCES;
+               goto out;
+       }
+
+       ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
+
+out:
+       free(cgdir);
+       free(next);
+       return ret;
+}
+
+/*
+ * Remove the directory @dirname (a path relative to @cfd) after trying to
+ * remove the sub-directories found in it.
+ *
+ * dirname: path of the directory, relative to @cfd.
+ * fd:      open fd whose entries are listed; it is dup()ed and never closed
+ *          here, so the caller keeps ownership.
+ * cfd:     directory fd that all paths are resolved against.
+ *
+ * Returns true only when both closedir() and the final unlinkat()
+ * succeeded.
+ *
+ * NOTE(review): the recursive call passes the same @fd, so the nested call
+ * lists the directory behind @fd rather than the sub-directory. Left as-is;
+ * confirm whether opening the child directory was intended.
+ */
+static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
+{
+       struct dirent *direntp;
+       DIR *dir;
+       bool ret = false;
+       char pathname[MAXPATHLEN];
+       int dupfd;
+
+       dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
+       if (dupfd < 0)
+               return false;
+
+       dir = fdopendir(dupfd);
+       if (!dir) {
+               lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
+               close(dupfd);
+               return false;
+       }
+
+       while ((direntp = readdir(dir))) {
+               struct stat mystat;
+               int rc;
+
+               if (!strcmp(direntp->d_name, ".") ||
+                   !strcmp(direntp->d_name, ".."))
+                       continue;
+
+               rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
+               if (rc < 0 || rc >= MAXPATHLEN) {
+                       lxcfs_error("%s\n", "Pathname too long.");
+                       continue;
+               }
+
+               rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
+               if (rc) {
+                       lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
+                       continue;
+               }
+               if (S_ISDIR(mystat.st_mode))
+                       if (!recursive_rmdir(pathname, fd, cfd))
+                               lxcfs_debug("Error removing %s.\n", pathname);
+       }
+
+       ret = true;
+       /*
+        * closedir() also closes dupfd. Closing dupfd again afterwards would
+        * be a double close and could hit an unrelated descriptor that
+        * another thread opened in the meantime.
+        */
+       if (closedir(dir) < 0) {
+               lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
+               ret = false;
+       }
+
+       if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
+               lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
+               ret = false;
+       }
+
+       return ret;
+}
+
+/*
+ * Remove cgroup @cg below the mount of @controller by handing it to
+ * recursive_rmdir().
+ *
+ * Returns false when the controller has no fd, the cgroup directory cannot
+ * be opened, or the removal fails.
+ */
+static bool cgfs_remove(const char *controller, const char *cg)
+{
+       bool removed;
+       int target_fd;
+       int cfd = get_cgroup_fd(controller);
+
+       if (cfd < 0)
+               return false;
+
+       /* The *at() calls need a relative path: room for "." + cg + \0. */
+       size_t bufsz = strlen(cg) + 2;
+       char *relpath = alloca(bufsz);
+       snprintf(relpath, bufsz, "%s%s", dot_or_empty(cg), cg);
+
+       target_fd = openat(cfd, relpath, O_DIRECTORY);
+       if (target_fd < 0)
+               return false;
+
+       removed = recursive_rmdir(relpath, target_fd, cfd);
+       close(target_fd);
+
+       return removed;
+}
+
+/*
+ * Remove the cgroup named by @path on behalf of the calling fuse context.
+ *
+ * Returns 0 on success or a negative errno value: -EIO without a fuse
+ * context, without cgroup_ops or on a pure unified layout; -EPERM for
+ * "/cgroup", a controller directory or a first-level cgroup; -EBUSY or
+ * -ENOENT when the caller is not placed appropriately for the cgroup;
+ * -EACCES when the parent is not writable for the caller; -EINVAL when the
+ * removal itself fails.
+ */
+int cg_rmdir(const char *path)
+{
+       struct fuse_context *fc = fuse_get_context();
+       char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
+       const char *cgroup;
+       int ret;
+
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
+       controller = pick_controller_from_path(fc, path);
+       if (!controller) /* Someone's trying to delete "/cgroup". */
+               return -EPERM;
+
+       cgroup = find_cgroup_in_path(path);
+       if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
+               return -EPERM;
+
+       get_cgdir_and_path(cgroup, &cgdir, &last);
+       if (!last) {
+               /* Someone's trying to delete a cgroup on the same level as the
+                * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
+                * rmdir "/cgroup/blkio/init.slice".
+                */
+               ret = -EPERM;
+               goto out;
+       }
+
+       /* Fall back to the caller's own pid when no usable init pid is found. */
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 1 || is_shared_pidns(initpid))
+               initpid = fc->pid;
+       if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
+               /* last is known to be non-NULL here, so only the comparison
+                * with next decides between -EBUSY and -ENOENT */
+               if (!last || (next && (strcmp(next, last) == 0)))
+                       ret = -EBUSY;
+               else
+                       ret = -ENOENT;
+               goto out;
+       }
+
+       /* Write access is checked on the parent directory (cgdir). */
+       if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
+               ret = -EACCES;
+               goto out;
+       }
+       if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
+               ret = -EACCES;
+               goto out;
+       }
+
+       if (!cgfs_remove(controller, cgroup)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = 0;
+
+out:
+       free(cgdir);
+       free(next);
+       return ret;
+}
+
+/*
+ * Apply @mode to @file below the mount of @controller.
+ *
+ * Returns true on success, false when the controller has no fd or
+ * fchmodat() fails.
+ */
+static bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
+{
+       int cfd = get_cgroup_fd(controller);
+
+       if (cfd < 0)
+               return false;
+
+       /* The *at() calls need a relative path: room for "." + file + \0. */
+       size_t bufsz = strlen(file) + 2;
+       char *relpath = alloca(bufsz);
+       snprintf(relpath, bufsz, "%s%s", dot_or_empty(file), file);
+
+       return fchmodat(cfd, relpath, mode, 0) >= 0;
+}
+
+/*
+ * Change the mode of the cgroup directory or file named by @path on behalf
+ * of the calling fuse context.
+ *
+ * Returns 0 on success or a negative errno value: -EIO without a fuse
+ * context, without cgroup_ops or on a pure unified layout; -EPERM for
+ * "/cgroup", a controller directory or a caller that is not privileged
+ * over the current owner; -EINVAL when no key is found or the chmod fails.
+ */
+int cg_chmod(const char *path, mode_t mode)
+{
+       struct fuse_context *fc = fuse_get_context();
+       char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
+       struct cgfs_files *k = NULL;
+       const char *cgroup;
+       int ret;
+
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
+       if (strcmp(path, "/cgroup") == 0)
+               return -EPERM;
+
+       controller = pick_controller_from_path(fc, path);
+       if (!controller)
+               return errno == ENOENT ? -EPERM : -errno;
+
+       cgroup = find_cgroup_in_path(path);
+       if (!cgroup)
+               /* this is just /cgroup/controller */
+               return -EPERM;
+
+       get_cgdir_and_path(cgroup, &cgdir, &last);
+
+       /* Split into a parent (path1) and the final component (path2). */
+       if (!last) {
+               path1 = "/";
+               path2 = cgdir;
+       } else {
+               path1 = cgdir;
+               path2 = last;
+       }
+
+       if (is_child_cgroup(controller, path1, path2)) {
+               // get uid, gid, from '/tasks' file and make up a mode
+               // That is a hack, until cgmanager gains a GetCgroupPerms fn.
+               k = cgfs_get_key(controller, cgroup, "tasks");
+
+       } else
+               k = cgfs_get_key(controller, path1, path2);
+
+       if (!k) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * This being a fuse request, the uid and gid must be valid
+        * in the caller's namespace.  NS_ROOT_OPT is passed, so
+        * is_privileged_over() accepts either a caller whose uid equals
+        * the file's current owner, or one that is root in his namespace
+        * and has the owner mapped.
+        */
+       if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
+               ret = -EPERM;
+               goto out;
+       }
+
+       /* The mode is applied to the full cgroup path, directory or file. */
+       if (!cgfs_chmod_file(controller, cgroup, mode)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = 0;
+out:
+       free_key(k);
+       free(cgdir);
+       return ret;
+}
+
+/*
+ * Report whether @path, resolved relative to directory fd @fd, is a
+ * directory.
+ *
+ * Returns 1 for a directory, 0 for anything else or when the path cannot
+ * be stat'ed.
+ */
+static int is_dir(const char *path, int fd)
+{
+       struct stat statbuf;
+       /*
+        * The last argument of fstatat() is a flags bitmask, not an fd.
+        * Passing fd there made the result depend on the descriptor's
+        * numeric value; 0 requests the default behaviour.
+        */
+       int ret = fstatat(fd, path, &statbuf, 0);
+       if (ret == 0 && S_ISDIR(statbuf.st_mode))
+               return 1;
+       return 0;
+}
+
+/*
+ * Chown the "tasks" and "cgroup.procs" files inside @dirname (relative to
+ * @fd) to @uid:@gid.
+ *
+ * Returns 0 on success or -errno from the first fchownat() that fails.
+ */
+static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
+{
+       /* Sized for the longer of the two names, including the final \0. */
+       size_t bufsz = strlen(dirname) + sizeof("/cgroup.procs");
+       char *target = alloca(bufsz);
+
+       snprintf(target, bufsz, "%s/tasks", dirname);
+       if (fchownat(fd, target, uid, gid, 0))
+               return -errno;
+
+       snprintf(target, bufsz, "%s/cgroup.procs", dirname);
+       if (fchownat(fd, target, uid, gid, 0))
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Chown @file below the mount of @controller to @uid:@gid. When @file is a
+ * directory, its "tasks" and "cgroup.procs" files are chowned as well.
+ *
+ * Returns 0 on success, -EINVAL when the controller has no fd, or -errno
+ * from the failing fchownat().
+ */
+static int cgfs_chown_file(const char *controller, const char *file, uid_t uid,
+                          gid_t gid)
+{
+       int cfd;
+       size_t len;
+       char *pathname;
+
+       cfd = get_cgroup_fd(controller);
+       if (cfd < 0)
+               /*
+                * This used to return false, i.e. 0, which callers of this
+                * negative-errno function read as success.
+                */
+               return -EINVAL;
+
+       /* Make sure we pass a relative path to *at() family of functions.
+        * . + /file + \0
+        */
+       len = strlen(file) + 2;
+       pathname = alloca(len);
+       snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
+       if (fchownat(cfd, pathname, uid, gid, 0) < 0)
+               return -errno;
+
+       if (is_dir(pathname, cfd))
+               return chown_tasks_files(pathname, uid, gid, cfd);
+
+       return 0;
+}
+
+/*
+ * Change the owner of the cgroup directory or file named by @path on
+ * behalf of the calling fuse context.
+ *
+ * Returns the result of cgfs_chown_file() on success or a negative errno
+ * value: -EIO without a fuse context, without cgroup_ops or on a pure
+ * unified layout; -EPERM for "/cgroup" or a controller directory; -EINVAL
+ * when no key is found; -EACCES when the caller is not privileged over the
+ * current owner.
+ */
+int cg_chown(const char *path, uid_t uid, gid_t gid)
+{
+       struct fuse_context *fc = fuse_get_context();
+       char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
+       struct cgfs_files *k = NULL;
+       const char *cgroup;
+       int ret;
+
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
+       if (strcmp(path, "/cgroup") == 0)
+               return -EPERM;
+
+       controller = pick_controller_from_path(fc, path);
+       if (!controller)
+               return errno == ENOENT ? -EPERM : -errno;
+
+       cgroup = find_cgroup_in_path(path);
+       if (!cgroup)
+               /* this is just /cgroup/controller */
+               return -EPERM;
+
+       get_cgdir_and_path(cgroup, &cgdir, &last);
+
+       /* Split into a parent (path1) and the final component (path2). */
+       if (!last) {
+               path1 = "/";
+               path2 = cgdir;
+       } else {
+               path1 = cgdir;
+               path2 = last;
+       }
+
+       if (is_child_cgroup(controller, path1, path2)) {
+               // get uid, gid, from '/tasks' file and make up a mode
+               // That is a hack, until cgmanager gains a GetCgroupPerms fn.
+               k = cgfs_get_key(controller, cgroup, "tasks");
+
+       } else
+               k = cgfs_get_key(controller, path1, path2);
+
+       if (!k) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * This being a fuse request, the uid and gid must be valid
+        * in the caller's namespace.  So we can just check to make
+        * sure that the caller is root in his namespace (NS_ROOT_REQD:
+        * merely owning the file is not enough), and privileged
+        * over the file's current owner.
+        */
+       if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
+               ret = -EACCES;
+               goto out;
+       }
+
+       /* The new owner is applied to the full cgroup path. */
+       ret = cgfs_chown_file(controller, cgroup, uid, gid);
+
+out:
+       free_key(k);
+       free(cgdir);
+
+       return ret;
+}
+
+/*
+ * Open the cgroup file named by @path: check that it exists and that the
+ * caller may access it with fi->flags, then store a freshly allocated
+ * struct file_info in fi->fh.
+ *
+ * Returns 0 on success or a negative errno value: -EIO without a fuse
+ * context, without cgroup_ops or on a pure unified layout; -errno when the
+ * path has no controller or cgroup part; -EINVAL when no key is found;
+ * -ENOENT when the caller may not see the directory; -EACCES when access
+ * is refused; -ENOMEM when the allocation fails.
+ */
+int cg_open(const char *path, struct fuse_file_info *fi)
+{
+       const char *cgroup;
+       char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
+       struct cgfs_files *k = NULL;
+       struct file_info *file_info;
+       struct fuse_context *fc = fuse_get_context();
+       int ret;
+
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
+       controller = pick_controller_from_path(fc, path);
+       if (!controller)
+               return -errno;
+       cgroup = find_cgroup_in_path(path);
+       if (!cgroup)
+               return -errno;
+
+       /* Split into the containing cgroup (path1) and the file (path2). */
+       get_cgdir_and_path(cgroup, &cgdir, &last);
+       if (!last) {
+               path1 = "/";
+               path2 = cgdir;
+       } else {
+               path1 = cgdir;
+               path2 = last;
+       }
+
+       /* The key is only looked up to prove that the file exists. */
+       k = cgfs_get_key(controller, path1, path2);
+       if (!k) {
+               ret = -EINVAL;
+               goto out;
+       }
+       free_key(k);
+
+       /* Fall back to the caller's own pid when no usable init pid is found. */
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 1 || is_shared_pidns(initpid))
+               initpid = fc->pid;
+       if (!caller_may_see_dir(initpid, controller, path1)) {
+               ret = -ENOENT;
+               goto out;
+       }
+       if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
+               ret = -EACCES;
+               goto out;
+       }
+
+       /* we'll free this at cg_release */
+       file_info = malloc(sizeof(*file_info));
+       if (!file_info) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       /* The strings are copied, so cgdir can be freed below. */
+       file_info->controller = must_copy_string(controller);
+       file_info->cgroup = must_copy_string(path1);
+       file_info->file = must_copy_string(path2);
+       file_info->type = LXC_TYPE_CGFILE;
+       file_info->buf = NULL;
+       file_info->buflen = 0;
+
+       fi->fh = (unsigned long)file_info;
+       ret = 0;
+
+out:
+       free(cgdir);
+       return ret;
+}
+
+#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
+
+/*
+ * Wait until @sock reports one of the POLLIN_SET events, for roughly
+ * @timeout seconds at most. An interrupted wait (EINTR) is restarted with
+ * the remaining time.
+ *
+ * Returns true when epoll_wait() reported an event, false on timeout or
+ * error. errno is set to 0 when the deadline had already passed before
+ * waiting.
+ */
+static bool wait_for_sock(int sock, int timeout)
+{
+       struct epoll_event ev;
+       int epfd, ret, now, starttime, deltatime, saved_errno;
+
+       if ((starttime = time(NULL)) < 0)
+               return false;
+
+       /*
+        * "%m" used to be passed as an argument to "%s" and was therefore
+        * printed literally; format the error text explicitly instead.
+        */
+       if ((epfd = epoll_create(1)) < 0) {
+               lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
+               return false;
+       }
+
+       ev.events = POLLIN_SET;
+       ev.data.fd = sock;
+       if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
+               lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
+               close(epfd);
+               return false;
+       }
+
+again:
+       if ((now = time(NULL)) < 0) {
+               close(epfd);
+               return false;
+       }
+
+       deltatime = (starttime + timeout) - now;
+       if (deltatime < 0) { // timeout
+               errno = 0;
+               close(epfd);
+               return false;
+       }
+       ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
+       if (ret < 0 && errno == EINTR)
+               goto again;
+       /* close() may change errno, so keep the value from epoll_wait(). */
+       saved_errno = errno;
+       close(epfd);
+
+       if (ret <= 0) {
+               errno = saved_errno;
+               return false;
+       }
+       return true;
+}
+
+/*
+ * Receive up to @len bytes from @sockfd into @buf, giving the socket two
+ * seconds to become readable first.
+ *
+ * Returns the result of recv(), or -1 when waiting for the socket failed.
+ */
+static int msgrecv(int sockfd, void *buf, size_t len)
+{
+       bool readable = wait_for_sock(sockfd, 2);
+
+       return readable ? recv(sockfd, buf, len, MSG_DONTWAIT) : -1;
+}
+
+#define SEND_CREDS_OK 0
+#define SEND_CREDS_NOTSK 1
+#define SEND_CREDS_FAIL 2
+
+/*
+ * Send the single byte @v over @sock together with @cred as an
+ * SCM_CREDENTIALS control message.
+ *
+ * pingfirst: when true, first wait for one byte from the peer before
+ *            sending.
+ *
+ * Returns SEND_CREDS_OK on success, SEND_CREDS_NOTSK when sendmsg() fails
+ * with ESRCH, and SEND_CREDS_FAIL for every other failure.
+ */
+static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
+{
+       struct msghdr msg = { 0 };
+       struct iovec iov;
+       struct cmsghdr *cmsg;
+       char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
+       char buf[1];
+       buf[0] = 'p';
+
+       if (pingfirst) {
+               if (msgrecv(sock, buf, 1) != 1) {
+                       lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
+                       return SEND_CREDS_FAIL;
+               }
+       }
+
+       msg.msg_control = cmsgbuf;
+       msg.msg_controllen = sizeof(cmsgbuf);
+
+       cmsg = CMSG_FIRSTHDR(&msg);
+       cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+       cmsg->cmsg_level = SOL_SOCKET;
+       cmsg->cmsg_type = SCM_CREDENTIALS;
+       memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
+
+       msg.msg_name = NULL;
+       msg.msg_namelen = 0;
+
+       buf[0] = v;
+       iov.iov_base = buf;
+       iov.iov_len = sizeof(buf);
+       msg.msg_iov = &iov;
+       msg.msg_iovlen = 1;
+
+       if (sendmsg(sock, &msg, 0) < 0) {
+               /* Logging may overwrite errno, so capture it first. */
+               int saved_errno = errno;
+
+               lxcfs_error("Failed at sendmsg: %s.\n", strerror(saved_errno));
+               /* ESRCH replaces the bare number 3 that was tested here. */
+               if (saved_errno == ESRCH)
+                       return SEND_CREDS_NOTSK;
+               return SEND_CREDS_FAIL;
+       }
+
+       return SEND_CREDS_OK;
+}
+
+/*
+ * Reap child @pid, restarting waitpid() when it is interrupted.
+ *
+ * Returns 0 when the child exited normally with status 0, and -1 for an
+ * invalid pid, a waitpid() failure, or any other way the child ended.
+ */
+static int wait_for_pid(pid_t pid)
+{
+       int status;
+
+       if (pid <= 0)
+               return -1;
+
+       for (;;) {
+               pid_t waited = waitpid(pid, &status, 0);
+
+               if (waited == pid)
+                       break;
+               /* Only EINTR is retried; any other failure is final. */
+               if (waited == -1 && errno != EINTR)
+                       return -1;
+       }
+
+       return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
+}
+
+/*
+ * Receive one byte plus SCM_CREDENTIALS ancillary data from @sock.
+ *
+ * cred: filled with the received credentials; every field is left at -1
+ *       when the message carries no matching control message.
+ * v:    receives the data byte of the message ('1' until one arrives).
+ *
+ * Returns false when enabling SO_PASSCRED, sending the start byte, waiting
+ * for the socket or recvmsg() fails, true otherwise.
+ */
+static bool recv_creds(int sock, struct ucred *cred, char *v)
+{
+       struct msghdr msg = { 0 };
+       struct iovec iov;
+       struct cmsghdr *cmsg;
+       char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
+       char buf[1];
+       int ret;
+       int optval = 1;
+
+       *v = '1';
+
+       cred->pid = -1;
+       cred->uid = -1;
+       cred->gid = -1;
+
+       if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
+               lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
+               return false;
+       }
+       /* One byte is written first; send_creds() with pingfirst set waits
+        * for a byte like this before it sends. */
+       buf[0] = '1';
+       if (write(sock, buf, 1) != 1) {
+               lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
+               return false;
+       }
+
+       msg.msg_name = NULL;
+       msg.msg_namelen = 0;
+       msg.msg_control = cmsgbuf;
+       msg.msg_controllen = sizeof(cmsgbuf);
+
+       iov.iov_base = buf;
+       iov.iov_len = sizeof(buf);
+       msg.msg_iov = &iov;
+       msg.msg_iovlen = 1;
+
+       /* Give the sender two seconds before giving up. */
+       if (!wait_for_sock(sock, 2)) {
+               lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
+               return false;
+       }
+       ret = recvmsg(sock, &msg, MSG_DONTWAIT);
+       if (ret < 0) {
+               lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
+               return false;
+       }
+
+       cmsg = CMSG_FIRSTHDR(&msg);
+
+       /* Only a well-formed SCM_CREDENTIALS message overwrites *cred. */
+       if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
+                       cmsg->cmsg_level == SOL_SOCKET &&
+                       cmsg->cmsg_type == SCM_CREDENTIALS) {
+               memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
+       }
+       *v = buf[0];
+
+       return true;
+}
+
+/*
+ * pid_to_ns - keeps receiving ucreds over @sock and writes the pid found
+ * in each one back over the socket.  This shifts the pid from the sender's
+ * pidns into the pidns this process runs in.
+ *
+ * The loop ends when receiving fails or when the received byte is '1'.
+ * Returns 1 when writing a pid back fails, 0 otherwise. @tpid is not used
+ * here.
+ */
+static int pid_to_ns(int sock, pid_t tpid)
+{
+       struct ucred cred;
+       char marker = '0';
+
+       for (;;) {
+               if (!recv_creds(sock, &cred, &marker))
+                       break;
+
+               if (marker == '1')
+                       break;
+
+               if (write(sock, &cred.pid, sizeof(cred.pid)) != sizeof(cred.pid))
+                       return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * pid_ns_clone_wrapper - entry point handed to clone(). It sends '1' as an
+ * ACK through the write end of the pipe in @arg, closes both pipe ends and
+ * then runs the wrapped function with the socket and target pid from @arg.
+ *
+ * Returns whatever the wrapped function returns.
+ */
+static int pid_ns_clone_wrapper(void *arg) {
+       struct pid_ns_clone_args *clone_args = arg;
+       const char ack = '1';
+       ssize_t written;
+
+       /* The read end belongs to the parent. */
+       close(clone_args->cpipe[0]);
+
+       written = write(clone_args->cpipe[1], &ack, sizeof(ack));
+       if (written < 0)
+               lxcfs_error("(child): error on write: %s.\n", strerror(errno));
+       close(clone_args->cpipe[1]);
+
+       return clone_args->wrapped(clone_args->sock, clone_args->tpid);
+}
+
+/*
+ * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
+ * in your old pidns.  Only children which you clone will be in the target
+ * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
+ * actually convert pids.
+ *
+ * Note: glibc's fork() does not respect pidns, which can lead to failed
+ * assertions inside glibc (and thus failed forks) if the child's pid in
+ * the pidns and the parent pid outside are identical. Using clone prevents
+ * this issue.
+ *
+ * This function never returns: it exits with status 0 when the cloned
+ * child was set up, acknowledged and finished successfully, and with
+ * status 1 on any failure.
+ */
+static void pid_to_ns_wrapper(int sock, pid_t tpid)
+{
+       int newnsfd = -1, ret, cpipe[2];
+       char fnam[100];
+       pid_t cpid;
+       char v;
+
+       ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
+       if (ret < 0 || ret >= sizeof(fnam))
+               _exit(1);
+       newnsfd = open(fnam, O_RDONLY);
+       if (newnsfd < 0)
+               _exit(1);
+       if (setns(newnsfd, 0) < 0)
+               _exit(1);
+       close(newnsfd);
+
+       if (pipe(cpipe) < 0)
+               _exit(1);
+
+       struct pid_ns_clone_args args = {
+               .cpipe = cpipe,
+               .sock = sock,
+               .tpid = tpid,
+               .wrapped = &pid_to_ns
+       };
+       size_t stack_size = sysconf(_SC_PAGESIZE);
+       void *stack = alloca(stack_size);
+
+       /* The top of the buffer is passed as the child's stack pointer. */
+       cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
+       if (cpid < 0)
+               _exit(1);
+
+       /* Give the child 1 second to be done forking and write its ack. */
+       if (!wait_for_sock(cpipe[0], 1))
+               _exit(1);
+       ret = read(cpipe[0], &v, 1);
+       if (ret != sizeof(char) || v != '1')
+               _exit(1);
+
+       /*
+        * wait_for_pid() returns 0 on success and -1 on failure. The former
+        * "!wait_for_pid(cpid)" test was inverted and reported a clean child
+        * exit as an error.
+        */
+       if (wait_for_pid(cpid) < 0)
+               _exit(1);
+       _exit(0);
+}
+
+/*
+ * Append a pid, followed by a newline, to the string *src.
+ * src: a pointer to the char* to which the pid is appended.
+ * sz: the number of characters printed so far, minus trailing \0.
+ * asz: the size allocated for *src so far.
+ * pid: the pid to append.
+ */
+static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
+{
+       const int value = (int)pid;
+
+       must_strcat(src, sz, asz, "%d\n", value);
+}
+
+/*
+ * Read the pid list @file of cgroup @cg under controller @contrl and
+ * translate every pid for the reader @tpid.
+ *
+ * The raw list is fetched with cgroup_ops->get(). A forked helper
+ * (pid_to_ns_wrapper(), which setns()es into @tpid's pidns) is handed one end
+ * of a SOCK_DGRAM socketpair. Each pid from the raw list is passed to the
+ * helper with send_creds(); the pid_t read back from the socket is appended
+ * to *d as one "<pid>\n" line via must_strcat_pid().
+ *
+ * Returns true on success, false on any failure. *d is only ever written by
+ * must_strcat_pid(), so it is left untouched when no pid was appended.
+ */
+static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg,
+                        const char *file, char **d)
+{
+       int sock[2] = {-1, -1};
+       char *tmpdata = NULL;
+       int ret;
+       pid_t qpid, cpid = -1;
+       bool answer = false;
+       char v = '0';
+       struct ucred cred;
+       size_t sz = 0, asz = 0;
+
+       if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
+               return false;
+
+       /*
+        * Now we read the pids from returned data one by one, pass
+        * them into a child in the target namespace, read back the
+        * translated pids, and put them into our to-return data
+        */
+
+       if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
+               perror("socketpair");
+               free(tmpdata);
+               return false;
+       }
+
+       cpid = fork();
+       if (cpid == -1)
+               goto out;
+
+       if (!cpid) // child - exits when done
+               pid_to_ns_wrapper(sock[1], tpid);
+
+       /* Parent: walk tmpdata one line (one pid) at a time. */
+       char *ptr = tmpdata;
+       cred.uid = 0;
+       cred.gid = 0;
+       while (sscanf(ptr, "%d\n", &qpid) == 1) {
+               cred.pid = qpid;
+               ret = send_creds(sock[0], &cred, v, true);
+
+               /* NOTSK: skip this pid and continue with the next line. */
+               if (ret == SEND_CREDS_NOTSK)
+                       goto next;
+               if (ret == SEND_CREDS_FAIL)
+                       goto out;
+
+               // read converted results
+               if (!wait_for_sock(sock[0], 2)) {
+                       lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
+                       goto out;
+               }
+               if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
+                       lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
+                       goto out;
+               }
+               must_strcat_pid(d, &sz, &asz, qpid);
+next:
+               /* Advance to the character after the next newline, if any. */
+               ptr = strchr(ptr, '\n');
+               if (!ptr)
+                       break;
+               ptr++;
+       }
+
+       /* Sending our own pid with v == '1' asks the helper to exit. */
+       cred.pid = getpid();
+       v = '1';
+       if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
+               // failed to ask child to exit
+               lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
+               goto out;
+       }
+
+       answer = true;
+
+out:
+       free(tmpdata);
+       /* Reap the helper if one was forked. */
+       if (cpid != -1)
+               wait_for_pid(cpid);
+       if (sock[0] != -1) {
+               close(sock[0]);
+               close(sock[1]);
+       }
+       return answer;
+}
+
+/*
+ * Read handler for a cgroup file opened through lxcfs.
+ *
+ * Only offset 0 is served; any other offset returns 0. The pid list files
+ * ("tasks" / "cgroup.procs") go through do_read_pids() so that pids are
+ * translated for the caller; everything else is read with cgroup_ops->get().
+ * If the data does not end in a newline and there is room in @buf, one is
+ * appended.
+ *
+ * Returns the number of bytes placed in @buf, or -EIO, -EINVAL or -EACCES.
+ */
+int cg_read(const char *path, char *buf, size_t size, off_t offset,
+           struct fuse_file_info *fi)
+{
+       struct fuse_context *fc = fuse_get_context();
+       struct file_info *f = (struct file_info *)fi->fh;
+       struct cgfs_files *key;
+       char *contents = NULL;
+       bool is_pid_list, ok;
+       int len, ret = 0;
+
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
+       if (f->type != LXC_TYPE_CGFILE) {
+               lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
+               return -EIO;
+       }
+
+       if (offset)
+               return 0;
+
+       if (!f->controller)
+               return -EINVAL;
+
+       /* The key lookup is only an existence check; the result is dropped. */
+       key = cgfs_get_key(f->controller, f->cgroup, f->file);
+       if (!key)
+               return -EINVAL;
+       free_key(key);
+
+       if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
+               return -EACCES;
+
+       is_pid_list = !strcmp(f->file, "tasks") ||
+                     !strcmp(f->file, "/tasks") ||
+                     !strcmp(f->file, "/cgroup.procs") ||
+                     !strcmp(f->file, "cgroup.procs");
+
+       if (is_pid_list)
+               ok = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &contents);
+       else
+               ok = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &contents);
+
+       if (!ok) {
+               ret = -EINVAL;
+       } else if (contents) {
+               /* Copy at most @size bytes, then add a newline if missing. */
+               len = strlen(contents);
+               if (len > size)
+                       len = size;
+               memcpy(buf, contents, len);
+               if (len > 0 && len < size && contents[len - 1] != '\n')
+                       buf[len++] = '\n';
+               ret = len;
+       }
+
+       free(contents);
+       return ret;
+}
+
+/*
+ * Opendir handler for the cgroup tree.
+ *
+ * "/cgroup" itself is opened with neither controller nor cgroup recorded.
+ * For any other path the controller is taken from the path and the cgroup
+ * defaults to "/" when the path names only a controller; the caller must
+ * then be allowed to see and read that cgroup.
+ *
+ * On success a struct file_info of type LXC_TYPE_CGDIR is stored in fi->fh.
+ * Returns 0, or -EIO, -errno (from the controller lookup), -ENOENT, -EACCES
+ * or -ENOMEM.
+ */
+int cg_opendir(const char *path, struct fuse_file_info *fi)
+{
+       struct fuse_context *fc = fuse_get_context();
+       const char *cgroup = NULL;
+       char *controller = NULL;
+       struct file_info *info;
+       pid_t initpid;
+
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
+       if (strcmp(path, "/cgroup") != 0) {
+               // return list of keys for the controller, and list of child cgroups
+               controller = pick_controller_from_path(fc, path);
+               if (!controller)
+                       return -errno;
+
+               /* a bare /cgroup/controller path means the controller's root */
+               cgroup = find_cgroup_in_path(path);
+               if (!cgroup)
+                       cgroup = "/";
+       }
+
+       initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 1 || is_shared_pidns(initpid))
+               initpid = fc->pid;
+
+       if (cgroup && !caller_may_see_dir(initpid, controller, cgroup))
+               return -ENOENT;
+       if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
+               return -EACCES;
+
+       /* we'll free this at cg_releasedir */
+       info = malloc(sizeof(*info));
+       if (!info)
+               return -ENOMEM;
+
+       info->controller = must_copy_string(controller);
+       info->cgroup = must_copy_string(cgroup);
+       info->type = LXC_TYPE_CGDIR;
+       info->buf = NULL;
+       info->file = NULL;
+       info->buflen = 0;
+
+       fi->fh = (unsigned long)info;
+       return 0;
+}
+
+/*
+ * Release handler for cgroup files: hands fi to do_release_file_info() and
+ * always returns 0. @path is unused.
+ */
+int cg_release(const char *path, struct fuse_file_info *fi)
+{
+       do_release_file_info(fi);
+       return 0;
+}
+
+/*
+ * Releasedir handler for cgroup directories: hands fi (set up by
+ * cg_opendir()) to do_release_file_info() and always returns 0. @path is
+ * unused.
+ */
+int cg_releasedir(const char *path, struct fuse_file_info *fi)
+{
+       do_release_file_info(fi);
+       return 0;
+}
+
+/*
+ * Open the "cgroup.procs" file of @cgroup under @controller for writing.
+ *
+ * Returns a stdio stream opened in "w" mode, or NULL on any failure. The
+ * caller owns the returned stream and must fclose() it.
+ */
+static FILE *open_pids_file(const char *controller, const char *cgroup)
+{
+       int fd, cfd, ret;
+       size_t len;
+       char *pathname;
+       FILE *f;
+
+       cfd = get_cgroup_fd(controller);
+       if (cfd < 0)
+               return NULL;
+
+       /* Make sure we pass a relative path to *at() family of functions.
+        * . + /cgroup + / "cgroup.procs" + \0
+        */
+       len = strlen(cgroup) + strlen("cgroup.procs") + 3;
+       pathname = alloca(len);
+       ret = snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);
+       if (ret < 0 || (size_t)ret >= len)
+               return NULL;
+
+       fd = openat(cfd, pathname, O_WRONLY);
+       if (fd < 0)
+               return NULL;
+
+       /* fdopen() leaves fd open when it fails, so close it ourselves. */
+       f = fdopen(fd, "w");
+       if (!f)
+               close(fd);
+
+       return f;
+}
+
+/*
+ * Helper loop installed as .wrapped by pid_from_ns_wrapper().
+ *
+ * Reads pid_t values from @sock until -1 arrives. Each pid is sent back with
+ * send_creds() and tag '0'; if that fails, the helper's own pid is sent with
+ * tag '1' instead. @tpid is not used here.
+ *
+ * Returns 0 once -1 has been received, and 1 on a read timeout, a short
+ * read, or when the fallback message cannot be sent either.
+ */
+static int pid_from_ns(int sock, pid_t tpid)
+{
+       struct ucred cred;
+
+       cred.gid = 0;
+       cred.uid = 0;
+
+       for (;;) {
+               pid_t vpid;
+               int nread;
+
+               if (!wait_for_sock(sock, 2)) {
+                       lxcfs_error("%s\n", "Timeout reading from parent.");
+                       return 1;
+               }
+
+               nread = read(sock, &vpid, sizeof(pid_t));
+               if (nread != sizeof(pid_t)) {
+                       lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
+                       return 1;
+               }
+
+               if (vpid == -1) // done
+                       return 0;
+
+               cred.pid = vpid;
+               if (send_creds(sock, &cred, '0', true) == SEND_CREDS_OK)
+                       continue;
+
+               /* Could not send that pid: reply with our own pid and '1'. */
+               cred.pid = getpid();
+               if (send_creds(sock, &cred, '1', false) != SEND_CREDS_OK)
+                       return 1;
+       }
+}
+
+/*
+ * Counterpart of pid_to_ns_wrapper() that runs pid_from_ns() as the wrapped
+ * function: setns() into @tpid's pid namespace, clone() a child that starts
+ * in pid_ns_clone_wrapper(), wait for its one-byte ack and then for the child
+ * itself. Never returns; the process ends in _exit().
+ */
+static void pid_from_ns_wrapper(int sock, pid_t tpid)
+{
+       int newnsfd = -1, ret, cpipe[2];
+       char fnam[100];
+       pid_t cpid;
+       char v;
+
+       /* Open tpid's pid namespace handle and switch to it with setns(). */
+       ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
+       if (ret < 0 || ret >= sizeof(fnam))
+               _exit(1);
+       newnsfd = open(fnam, O_RDONLY);
+       if (newnsfd < 0)
+               _exit(1);
+       if (setns(newnsfd, 0) < 0)
+               _exit(1);
+       close(newnsfd);
+
+       /* Pipe handed to the clone child via args.cpipe; its ack is read below. */
+       if (pipe(cpipe) < 0)
+               _exit(1);
+
+       struct pid_ns_clone_args args = {
+               .cpipe = cpipe,
+               .sock = sock,
+               .tpid = tpid,
+               .wrapped = &pid_from_ns
+       };
+       /*
+        * One page of alloca()'d memory is used as the child's stack; clone()
+        * is given the address one past the end of that region.
+        */
+       size_t stack_size = sysconf(_SC_PAGESIZE);
+       void *stack = alloca(stack_size);
+
+       cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
+       if (cpid < 0)
+               _exit(1);
+
+       // give the child 1 second to be done forking and
+       // write its ack
+       if (!wait_for_sock(cpipe[0], 1))
+               _exit(1);
+       /* The ack must be exactly one byte with the value '1'. */
+       ret = read(cpipe[0], &v, 1);
+       if (ret != sizeof(char) || v != '1')
+               _exit(1);
+
+       if (!wait_for_pid(cpid))
+               _exit(1);
+       _exit(0);
+}
+
+/*
+ * get_pid_creds: look up the uid and gid of @pid by taking the first number
+ * on the "Uid:" and "Gid:" lines of /proc/@pid/status.
+ * (XXX should we use euid here?)
+ *
+ * *uid and *gid are preset to -1 and keep that value when the status file
+ * cannot be opened or the matching line is missing or malformed.
+ */
+static void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
+{
+       char path[400], line[400];
+       FILE *status;
+
+       *uid = -1;
+       *gid = -1;
+
+       sprintf(path, "/proc/%d/status", pid);
+       status = fopen(path, "r");
+       if (!status) {
+               lxcfs_error("Error opening %s: %s\n", path, strerror(errno));
+               return;
+       }
+
+       while (fgets(line, sizeof(line), status)) {
+               if (!strncmp(line, "Uid:", 4)) {
+                       uid_t parsed_uid;
+
+                       if (sscanf(line + 4, "%u", &parsed_uid) != 1) {
+                               lxcfs_error("bad uid line for pid %u\n", pid);
+                               break;
+                       }
+                       *uid = parsed_uid;
+               } else if (!strncmp(line, "Gid:", 4)) {
+                       gid_t parsed_gid;
+
+                       if (sscanf(line + 4, "%u", &parsed_gid) != 1) {
+                               lxcfs_error("bad gid line for pid %u\n", pid);
+                               break;
+                       }
+                       *gid = parsed_gid;
+               }
+       }
+
+       fclose(status);
+}
+
+/*
+ * Translate host @uid through /proc/@pid/uid_map.
+ *
+ * The value produced by convert_id_to_ns() is stored in *answer. Returns
+ * false when the map cannot be opened or the stored value equals -1, and
+ * true otherwise.
+ */
+static bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
+{
+       char map_path[400];
+       FILE *map;
+
+       sprintf(map_path, "/proc/%d/uid_map", pid);
+
+       map = fopen(map_path, "r");
+       if (!map)
+               return false;
+
+       *answer = convert_id_to_ns(map, uid);
+       fclose(map);
+
+       return *answer != -1;
+}
+
+/*
+ * May the requestor @r move victim @v to a new cgroup?
+ * This is allowed if
+ *   . they are the same task
+ *   . they are owned by the same uid
+ *   . @r is root on the host, or
+ *   . @v's uid is mapped into @r's where @r is root.
+ */
+static bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
+{
+       uid_t victim_uid, mapped;
+       gid_t victim_gid;
+
+       if (r == v || r_uid == 0)
+               return true;
+
+       get_pid_creds(v, &victim_uid, &victim_gid);
+       if (victim_uid == r_uid)
+               return true;
+
+       /* @r_uid has to map to 0 in @r's uid_map ... */
+       if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
+               return false;
+
+       /* ... and the victim's uid has to be mapped there as well. */
+       return hostuid_to_ns(victim_uid, r, &mapped);
+}
+
+/*
+ * Move the pids listed in @buf into cgroup @cg of controller @contrl on
+ * behalf of the writer @tpid (host uid @tuid).
+ *
+ * Each number parsed from @buf is written to a forked helper
+ * (pid_from_ns_wrapper(), which setns()es into @tpid's pidns) over a
+ * SOCK_DGRAM socketpair. The helper's reply is received with recv_creds();
+ * when its tag is '0', cred.pid is checked with may_move_pid() and printed to
+ * the stream returned by open_pids_file(). @file is not used.
+ *
+ * Returns true only if no pid was refused, no fprintf() failed and the final
+ * fclose() succeeded.
+ *
+ * NOTE(review): pids are printed with "%d" and no separator into a stdio
+ * stream that is not flushed before the fclose() at out - confirm this
+ * behaves as intended when @buf holds more than one pid.
+ * NOTE(review): a failing recv_creds() is silently skipped - confirm that is
+ * deliberate.
+ */
+static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl,
+                         const char *cg, const char *file, const char *buf)
+{
+       int sock[2] = {-1, -1};
+       pid_t qpid, cpid = -1;
+       FILE *pids_file = NULL;
+       bool answer = false, fail = false;
+
+       pids_file = open_pids_file(contrl, cg);
+       if (!pids_file)
+               return false;
+
+       /*
+        * write the pids to a socket, have a helper in the writer's pidns
+        * send them back to us as credentials
+        */
+       if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
+               perror("socketpair");
+               goto out;
+       }
+
+       cpid = fork();
+       if (cpid == -1)
+               goto out;
+
+       if (!cpid) { // child
+               fclose(pids_file);
+               pid_from_ns_wrapper(sock[1], tpid);
+       }
+
+       /* Parent: walk buf one line (one pid) at a time. */
+       const char *ptr = buf;
+       while (sscanf(ptr, "%d", &qpid) == 1) {
+               struct ucred cred;
+               char v;
+
+               if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
+                       lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
+                       goto out;
+               }
+
+               if (recv_creds(sock[0], &cred, &v)) {
+                       /* Only replies tagged '0' are acted on. */
+                       if (v == '0') {
+                               if (!may_move_pid(tpid, tuid, cred.pid)) {
+                                       fail = true;
+                                       break;
+                               }
+                               if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
+                                       fail = true;
+                       }
+               }
+
+               ptr = strchr(ptr, '\n');
+               if (!ptr)
+                       break;
+               ptr++;
+       }
+
+       /* Tell the helper we are done: -1 makes pid_from_ns() leave its loop. */
+       qpid = -1;
+       if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
+               lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
+
+       if (!fail)
+               answer = true;
+
+out:
+       /* Reap the helper if one was forked. */
+       if (cpid != -1)
+               wait_for_pid(cpid);
+       if (sock[0] != -1) {
+               close(sock[0]);
+               close(sock[1]);
+       }
+       /* A failing fclose() turns the whole operation into a failure. */
+       if (pids_file) {
+               if (fclose(pids_file) != 0)
+                       answer = false;
+       }
+       return answer;
+}
+
+/*
+ * Write @string to the already opened descriptor @fd.
+ *
+ * @fnam is only used in error messages. The function takes ownership of
+ * @fd: it is closed on every path, including when fdopen() fails.
+ *
+ * Returns true if the whole string was written and the stream closed
+ * without error, false otherwise.
+ */
+static bool write_string(const char *fnam, const char *string, int fd)
+{
+       FILE *f;
+       size_t len, ret;
+
+       f = fdopen(fd, "w");
+       if (!f) {
+               /* fdopen() leaves fd open when it fails; do not leak it. */
+               close(fd);
+               return false;
+       }
+
+       len = strlen(string);
+       ret = fwrite(string, 1, len, f);
+       if (ret != len) {
+               lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
+                           strerror(errno), string, fnam);
+               fclose(f);
+               return false;
+       }
+
+       /* Buffered data is flushed here, so write errors may surface now. */
+       if (fclose(f) < 0) {
+               lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
+               return false;
+       }
+
+       return true;
+}
+
+/*
+ * Write @value to @file inside @cgroup of @controller.
+ *
+ * The file is opened O_WRONLY relative to the descriptor returned by
+ * get_cgroup_fd() and the write itself is delegated to write_string().
+ * Returns true on success, false on any failure.
+ */
+static bool cgfs_set_value(const char *controller, const char *cgroup,
+                          const char *file, const char *value)
+{
+       size_t pathlen;
+       char *relpath;
+       int printed, filefd;
+       int cgfd = get_cgroup_fd(controller);
+
+       if (cgfd < 0)
+               return false;
+
+       /* Make sure we pass a relative path to *at() family of functions.
+        * . + /cgroup + / + file + \0
+        */
+       pathlen = strlen(cgroup) + strlen(file) + 3;
+       relpath = alloca(pathlen);
+       printed = snprintf(relpath, pathlen, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
+       if (printed < 0 || (size_t)printed >= pathlen)
+               return false;
+
+       filefd = openat(cgfd, relpath, O_WRONLY);
+       return filefd < 0 ? false : write_string(relpath, value, filefd);
+}
+
+/*
+ * Write handler for a cgroup file opened through lxcfs.
+ *
+ * Only offset 0 is served; any other offset returns 0. @buf is copied into a
+ * NUL-terminated local buffer first. The pid list files ("tasks" /
+ * "cgroup.procs") go through do_write_pids() so that pids are translated
+ * from the caller's pid namespace; everything else is written with
+ * cgfs_set_value().
+ *
+ * Returns @size on success, or -EIO, -EINVAL or -EACCES.
+ */
+int cg_write(const char *path, const char *buf, size_t size, off_t offset,
+            struct fuse_file_info *fi)
+{
+       struct fuse_context *fc = fuse_get_context();
+       struct file_info *f = (struct file_info *)fi->fh;
+       struct cgfs_files *k = NULL;
+       char *localbuf = NULL;
+       bool r;
+       int ret;
+
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
+       if (f->type != LXC_TYPE_CGFILE) {
+               lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
+               return -EIO;
+       }
+
+       if (offset)
+               return 0;
+
+       /* Same guard as in cg_read(). */
+       if (!f->controller)
+               return -EINVAL;
+
+       localbuf = alloca(size+1);
+       localbuf[size] = '\0';
+       memcpy(localbuf, buf, size);
+
+       if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
+               ret = -EACCES;
+               goto out;
+       }
+
+       if (strcmp(f->file, "tasks") == 0 ||
+                       strcmp(f->file, "/tasks") == 0 ||
+                       strcmp(f->file, "/cgroup.procs") == 0 ||
+                       strcmp(f->file, "cgroup.procs") == 0)
+               // special case - we have to translate the pids
+               r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
+       else
+               r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
+
+       /*
+        * Keep the status in an int of its own rather than storing a negative
+        * errno in the unsigned size parameter.
+        */
+       ret = r ? (int)size : -EINVAL;
+
+out:
+       free_key(k);
+       return ret;
+}
+
+/*
+ * Build a NULL-terminated array from the entries of @cgroup's directory
+ * under @controller.
+ *
+ * @directories selects which entries are considered: directories when true,
+ * regular files when false. For every matching entry @iterator is called
+ * with (controller, relative cgroup path, entry name); non-NULL results are
+ * stored in *list, NULL results are skipped so that a single failed entry
+ * neither truncates the list nor leaks the entries that follow it.
+ * @typesize is the size of one array slot.
+ *
+ * *list is reset to NULL on entry and stays NULL when nothing was collected.
+ * The caller owns *list and its elements, also when false is returned.
+ * Returns true on success, false on failure.
+ */
+static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup,
+                               bool directories, void ***list, size_t typesize,
+                               void *(*iterator)(const char *, const char *, const char *))
+{
+       int cfd, fd, ret;
+       size_t len;
+       char *cg;
+       char pathname[MAXPATHLEN];
+       size_t sz = 0, asz = 0;
+       struct dirent *dirent;
+       DIR *dir;
+
+       cfd = get_cgroup_fd(controller);
+       *list = NULL;
+       if (cfd < 0)
+               return false;
+
+       /* Make sure we pass a relative path to *at() family of functions. */
+       len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
+       cg = alloca(len);
+       ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
+       if (ret < 0 || (size_t)ret >= len) {
+               lxcfs_error("Pathname too long under %s\n", cgroup);
+               return false;
+       }
+
+       fd = openat(cfd, cg, O_DIRECTORY);
+       if (fd < 0)
+               return false;
+
+       dir = fdopendir(fd);
+       if (!dir) {
+               /* fdopendir() did not take over fd, so release it here. */
+               close(fd);
+               return false;
+       }
+
+       while ((dirent = readdir(dir))) {
+               struct stat mystat;
+               void *entry;
+
+               if (!strcmp(dirent->d_name, ".") ||
+                   !strcmp(dirent->d_name, ".."))
+                       continue;
+
+               ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
+               if (ret < 0 || ret >= MAXPATHLEN) {
+                       lxcfs_error("Pathname too long under %s\n", cg);
+                       continue;
+               }
+
+               ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
+               if (ret) {
+                       lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
+                       continue;
+               }
+               if ((!directories && !S_ISREG(mystat.st_mode)) ||
+                   (directories && !S_ISDIR(mystat.st_mode)))
+                       continue;
+
+               /* A NULL entry would act as the list terminator: skip it. */
+               entry = (*iterator)(controller, cg, dirent->d_name);
+               if (!entry)
+                       continue;
+
+               /* Keep room for the new entry plus the NULL terminator. */
+               if (sz + 2 >= asz) {
+                       void **tmp;
+
+                       asz += BATCH_SIZE;
+                       do {
+                               tmp = realloc(*list, asz * typesize);
+                       } while (!tmp);
+                       *list = tmp;
+               }
+               (*list)[sz++] = entry;
+               (*list)[sz] = NULL;
+       }
+       if (closedir(dir) < 0) {
+               lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
+               return false;
+       }
+       return true;
+}
+
+/*
+ * Iterator callback used by cgfs_list_keys(): returns the cgfs_get_key()
+ * result for @dir_entry in @cgroup of @controller, logging an error and
+ * returning NULL when the lookup fails.
+ */
+static void *make_key_list_entry(const char *controller, const char *cgroup,
+                                const char *dir_entry)
+{
+       struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);
+
+       if (key)
+               return key;
+
+       lxcfs_error("Failed to retrieve files under %s:%s\n",
+                   controller, cgroup);
+       return NULL;
+}
+
+/*
+ * List the regular files of @cgroup under @controller as a NULL-terminated
+ * array of struct cgfs_files pointers stored in *keys. Thin wrapper around
+ * cgfs_iterate_cgroup() with directories == false and make_key_list_entry()
+ * as the per-entry callback; the slot size passed is that of one pointer.
+ */
+static bool cgfs_list_keys(const char *controller, const char *cgroup,
+                          struct cgfs_files ***keys)
+{
+       return cgfs_iterate_cgroup(controller, cgroup, false, (void ***)keys,
+                                  sizeof(*keys), &make_key_list_entry);
+}
+
+/*
+ * Iterator callback used by cgfs_list_children(): returns a strdup()'ed copy
+ * of @dir_entry (NULL if strdup() fails). @controller and @cgroup are unused.
+ */
+static void *make_children_list_entry(const char *controller,
+                                     const char *cgroup, const char *dir_entry)
+{
+       return strdup(dir_entry);
+}
+
+/*
+ * List the subdirectories of @cgroup under @controller as a NULL-terminated
+ * array of strings stored in *list. Thin wrapper around
+ * cgfs_iterate_cgroup() with directories == true and
+ * make_children_list_entry() as the per-entry callback.
+ */
+static bool cgfs_list_children(const char *controller, const char *cgroup,
+                              char ***list)
+{
+       return cgfs_iterate_cgroup(controller, cgroup, true, (void ***)list,
+                                  sizeof(*list), &make_children_list_entry);
+}
+
+/*
+ * Release a NULL-terminated array of keys: free_key() on every element up to
+ * the first NULL, then free the array itself. A NULL @keys is a no-op.
+ */
+static void free_keys(struct cgfs_files **keys)
+{
+       struct cgfs_files **cur;
+
+       if (!keys)
+               return;
+
+       cur = keys;
+       while (*cur) {
+               free_key(*cur);
+               cur++;
+       }
+
+       free_disarm(keys);
+}
+
+/*
+ * Readdir handler for the cgroup tree.
+ *
+ * "." and ".." are always emitted first. For the top-level directory (no
+ * controller and no cgroup recorded by cg_opendir()) the controller names of
+ * all non-unified hierarchies are listed. Otherwise, if
+ * caller_is_in_ancestor() reports false only the name it returned in nextcg
+ * (if any) is listed; else the cgroup's keys and then its child cgroups are
+ * listed.
+ *
+ * Returns 0 on success, -EIO on internal or filler errors and -EINVAL when
+ * the keys of the cgroup cannot be listed.
+ */
+int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
+              off_t offset, struct fuse_file_info *fi)
+{
+       struct fuse_context *fc = fuse_get_context();
+       struct file_info *d = (struct file_info *)fi->fh;
+       struct cgfs_files **keys = NULL;
+       char **children = NULL;
+       char *nextcg = NULL;
+       pid_t initpid;
+       int ret = 0;
+
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
+       if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
+               return -EIO;
+
+       if (d->type != LXC_TYPE_CGDIR) {
+               lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
+               return -EIO;
+       }
+
+       if (!d->cgroup && !d->controller) {
+               /*
+                * ls /var/lib/lxcfs/cgroup - just show list of controllers.
+                * This only works with the legacy hierarchy.
+                */
+               struct hierarchy **h = cgroup_ops->hierarchies;
+
+               while (h && *h) {
+                       if (!is_unified_hierarchy(*h) && (*h)->__controllers &&
+                           filler(buf, (*h)->__controllers, NULL, 0))
+                               return -EIO;
+                       h++;
+               }
+
+               return 0;
+       }
+
+       if (!cgfs_list_keys(d->controller, d->cgroup, &keys)) {
+               // not a valid cgroup
+               ret = -EINVAL;
+               goto out;
+       }
+
+       initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 1 || is_shared_pidns(initpid))
+               initpid = fc->pid;
+
+       if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
+               /* Only the entry named by nextcg (if any) is shown. */
+               if (nextcg) {
+                       int failed = filler(buf, nextcg,  NULL, 0);
+
+                       free(nextcg);
+                       if (failed != 0)
+                               ret = -EIO;
+               }
+               goto out;
+       }
+
+       for (struct cgfs_files **key = keys; key && *key; key++) {
+               if (filler(buf, (*key)->name, NULL, 0) != 0) {
+                       ret = -EIO;
+                       goto out;
+               }
+       }
+
+       // now get the list of child cgroups
+       if (!cgfs_list_children(d->controller, d->cgroup, &children))
+               goto out;
+
+       for (char **child = children; child && *child; child++) {
+               if (filler(buf, *child, NULL, 0) != 0) {
+                       ret = -EIO;
+                       goto out;
+               }
+       }
+
+out:
+       free_keys(keys);
+       if (children) {
+               for (char **child = children; *child; child++)
+                       free(*child);
+               free(children);
+       }
+       return ret;
+}
+
+/*
+ * Access handler for the cgroup tree.
+ *
+ * "/cgroup" is always accessible. A path naming only a controller, or one
+ * for which cgfs_get_key() finds nothing, is accessible unless W_OK is
+ * requested. Otherwise the caller must be allowed to see the containing
+ * cgroup and fc_may_access() must grant @mode.
+ *
+ * Returns 0, or -EIO, -errno (from the controller lookup), -EACCES or
+ * -ENOENT.
+ */
+int cg_access(const char *path, int mode)
+{
+       struct fuse_context *fc = fuse_get_context();
+       char *controller, *dirpart, *filepart;
+       char *cgdir = NULL, *last = NULL;
+       struct cgfs_files *key;
+       const char *cgroup;
+       pid_t initpid;
+       int ret = 0;
+
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
+       if (!strcmp(path, "/cgroup"))
+               return 0;
+
+       controller = pick_controller_from_path(fc, path);
+       if (!controller)
+               return -errno;
+
+       cgroup = find_cgroup_in_path(path);
+       if (!cgroup) {
+               // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
+               return (mode & W_OK) ? -EACCES : 0;
+       }
+
+       /* Without a last component the entry sits directly under "/". */
+       get_cgdir_and_path(cgroup, &cgdir, &last);
+       dirpart = last ? cgdir : "/";
+       filepart = last ? last : cgdir;
+
+       key = cgfs_get_key(controller, dirpart, filepart);
+       if (!key) {
+               if (mode & W_OK)
+                       ret = -EACCES;
+               goto out;
+       }
+       free_key(key);
+
+       initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 1 || is_shared_pidns(initpid))
+               initpid = fc->pid;
+
+       if (!caller_may_see_dir(initpid, controller, dirpart))
+               ret = -ENOENT;
+       else if (!fc_may_access(fc, controller, dirpart, filepart, mode))
+               ret = -EACCES;
+
+out:
+       free(cgdir);
+       return ret;
+}
diff --git a/cgroup_fuse.h b/cgroup_fuse.h

new file mode 100644 (file)

index 0000000..4515530
--- /dev/null
+++ b/cgroup_fuse.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/*
+ * Entry points of the cgroup part of lxcfs (cg_* handlers).
+ *
+ * NOTE(review): this header does not itself include the headers declaring
+ * struct stat, mode_t, uid_t/gid_t, struct fuse_file_info or fuse_fill_dir_t;
+ * it appears to rely on the including file having pulled them in first -
+ * confirm.
+ */
+
+#ifndef __LXCFS_CGROUP_FUSE_H
+#define __LXCFS_CGROUP_FUSE_H
+
+/* Attribute and namespace-modifying operations, addressed by path. */
+extern int cg_getattr(const char *path, struct stat *sb);
+extern int cg_mkdir(const char *path, mode_t mode);
+extern int cg_rmdir(const char *path);
+extern int cg_chmod(const char *path, mode_t mode);
+extern int cg_chown(const char *path, uid_t uid, gid_t gid);
+/* File and directory handle operations; per-handle state lives in fi->fh. */
+extern int cg_open(const char *path, struct fuse_file_info *fi);
+extern int cg_read(const char *path, char *buf, size_t size, off_t offset,
+                  struct fuse_file_info *fi);
+extern int cg_opendir(const char *path, struct fuse_file_info *fi);
+extern int cg_release(const char *path, struct fuse_file_info *fi);
+extern int cg_releasedir(const char *path, struct fuse_file_info *fi);
+extern int cg_write(const char *path, const char *buf, size_t size,
+                   off_t offset, struct fuse_file_info *fi);
+extern int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
+                     off_t offset, struct fuse_file_info *fi);
+/* Permission check for @path against the access(2)-style @mode bits. */
+extern int cg_access(const char *path, int mode);
+
+#endif /* __LXCFS_CGROUP_FUSE_H */
diff --git a/utils.c b/utils.c

index 1ebcf169d0950dba3c0fcc3d7aaf4038572374bc..5bfd442a91b530b7d6595bcd717698a86bfd3e69 100644 (file)
--- a/utils.c
+++ b/utils.c
@@ -127,3 +127,19 @@ int preserve_ns(const int pid, const char *ns)
  
         return open(path, O_RDONLY | O_CLOEXEC);
  }
+
+/*
+ * Free the struct file_info stored in fi->fh, including its controller,
+ * cgroup, file and buf members, and reset fi->fh to 0. Does nothing when
+ * fi->fh is already 0.
+ */
+void do_release_file_info(struct fuse_file_info *fi)
+{
+       struct file_info *info = (struct file_info *)fi->fh;
+
+       if (info) {
+               /* Detach the handle first so it cannot be released twice. */
+               fi->fh = 0;
+
+               free_disarm(info->controller);
+               free_disarm(info->cgroup);
+               free_disarm(info->file);
+               free_disarm(info->buf);
+               free_disarm(info);
+       }
+}
diff --git a/utils.h b/utils.h

index fbe775efeda228637d9f2a4dfb6338cd7393d7f4..0a4dd3cd1a4bc52fa55cd7c6ca901cdb8aecc822 100644 (file)
--- a/utils.h
+++ b/utils.h
@@ -3,11 +3,16 @@
  #ifndef __LXCFS_UTILS_H
  #define __LXCFS_UTILS_H
  
+#define FUSE_USE_VERSION 26
+
+#include <fuse.h>
+
  /* Reserve buffer size to account for file size changes. */
  #define BUF_RESERVE_SIZE 512
  
  extern void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...);
  extern bool is_shared_pidns(pid_t pid);
  extern int preserve_ns(const int pid, const char *ns);
+extern void do_release_file_info(struct fuse_file_info *fi);
  
  #endif /* __LXCFS_UTILS_H */
author	Christian Brauner <christian.brauner@ubuntu.com>
	Tue, 25 Feb 2020 16:17:10 +0000 (17:17 +0100)
committer	Christian Brauner <christian.brauner@ubuntu.com>
	Tue, 25 Feb 2020 16:18:44 +0000 (17:18 +0100)
Makefile.am		patch \| blob \| blame \| history
bindings.c		patch \| blob \| blame \| history
bindings.h		patch \| blob \| blame \| history
cgroup_fuse.c	[new file with mode: 0644]	patch \| blob
cgroup_fuse.h	[new file with mode: 0644]	patch \| blob
utils.c		patch \| blob \| blame \| history
utils.h		patch \| blob \| blame \| history