Remove unused chunks in caching code

[mirror_lxcfs.git] / lxcfs.c
diff --git a/lxcfs.c b/lxcfs.c

index e2ccc5e855d5f72c2a38b81ac2934cf76cd87698..a1470bf2a3e1154daa3cc84c23067e6b517a4e09 100644 (file)
--- a/lxcfs.c
+++ b/lxcfs.c
@@ -1,19 +1,11 @@
  /* lxcfs
   *
- * Copyright © 2014,2015 Canonical, Inc
+ * Copyright © 2014-2016 Canonical, Inc
   * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   *
   * See COPYING file for details.
   */
  
-/*
- * TODO XXX
- * sanitize paths for '..', cgmanager's not doing that for us any more
- *     does fuse help us?
- * Surely there are more paths we'll need to sanitize - look back through
- * cgmanager's sources.
- */
-
  #define FUSE_USE_VERSION 26
  
  #include <stdio.h>
@@ -28,9 +20,11 @@
  #include <stdlib.h>
  #include <libgen.h>
  #include <sched.h>
+#include <pthread.h>
  #include <linux/sched.h>
  #include <sys/socket.h>
  #include <sys/mount.h>
+#include <sys/epoll.h>
  #include <wait.h>
  
  #ifdef FORTRAVIS
@@ -66,41 +60,326 @@ struct file_info {
  #define BUF_RESERVE_SIZE 256
  
  /*
- * append pid to *src.
- * src: a pointer to a char* in which ot append the pid.
- * sz: the number of characters printed so far, minus trailing \0.
- * asz: the allocated size so far
- * pid: the pid to append
+ * A table caching which pid is init for a pid namespace.
+ * When looking up which pid is init for $qpid, we first
+ * 1. Stat /proc/$qpid/ns/pid.
+ * 2. Check whether the ino_t is in our store.
+ *   a. if not, fork a child in qpid's ns to send us
+ *      ucred.pid = 1, and read the initpid.  Cache
+ *      initpid and creation time for /proc/initpid
+ *      in a new store entry.
+ *   b. if so, verify that /proc/initpid still matches
+ *      what we have saved.  If not, clear the store
+ *      entry and go back to a.  If so, return the
+ *      cached initpid.
   */
-static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
+struct pidns_init_store {
+       ino_t ino;          // inode number for /proc/$pid/ns/pid
+       pid_t initpid;      // the pid of init in that ns
+       long int ctime;     // st_ctime of /proc/$initpid recorded when the entry was cached
+       struct pidns_init_store *next; // next entry in the same hash bucket
+       long int lastcheck; // time(NULL) when the entry was saved or last validated
+};
+
+/* lol - look at how they are allocated in the kernel */
+#define PIDNS_HASH_SIZE 4096
+#define HASH(x) ((x) % PIDNS_HASH_SIZE)
+
+struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
+static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
+static void lock_mutex(pthread_mutex_t *l)
  {
-       char *d = *src;
-       char tmp[30];
+       int ret;
  
-       sprintf(tmp, "%d\n", (int)pid);
+       if ((ret = pthread_mutex_lock(l)) != 0) {
+               fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", ret, strerror(ret));
+               exit(1);
+       }
+}
  
-       if (!d) {
-               do {
-                       d = malloc(BUF_RESERVE_SIZE);
-               } while (!d);
-               *src = d;
-               *asz = BUF_RESERVE_SIZE;
-       } else if (strlen(tmp) + sz + 1 >= asz) {
-               do {
-                       d = realloc(d, *asz + BUF_RESERVE_SIZE);
-               } while (!d);
-               *src = d;
-               *asz += BUF_RESERVE_SIZE;
+/*
+ * Release @l.  A failure to unlock is treated as fatal: the error is
+ * reported on stderr and the process exits with status 1.
+ */
+static void unlock_mutex(pthread_mutex_t *l)
+{
+       int err = pthread_mutex_unlock(l);
+
+       if (err == 0)
+               return;
+
+       fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", err, strerror(err));
+       exit(1);
+}
+
+/* Take the mutex guarding the pid namespace init store (pidns_hash_table). */
+static void store_lock(void)
+{
+       lock_mutex(&pidns_store_mutex);
+}
+
+/* Release the mutex guarding the pid namespace init store (pidns_hash_table). */
+static void store_unlock(void)
+{
+       unlock_mutex(&pidns_store_mutex);
+}
+
+/*
+ * Check whether the cached init pid in @e still refers to the same process:
+ * /proc/<initpid> must still exist and its st_ctime must equal the value
+ * recorded in the entry.  @nsfdsb is currently unused.
+ * Must be called under store_lock.
+ */
+static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
+{
+       char procpath[100];
+       struct stat cursb;
+
+       snprintf(procpath, 100, "/proc/%d", e->initpid);
+       if (stat(procpath, &cursb) < 0)
+               return false;
+#if DEBUG
+       fprintf(stderr, "comparing ctime %ld %ld for pid %d\n",
+               e->ctime, cursb.st_ctime, e->initpid);
+#endif
+       return e->ctime == cursb.st_ctime;
+}
+
+/*
+ * Unlink @e from its hash bucket and free it.  If @e is not found in the
+ * bucket selected by its inode number, nothing is changed or freed.
+ * Must be called under store_lock.
+ */
+static void remove_initpid(struct pidns_init_store *e)
+{
+       struct pidns_init_store **link;
+
+#if DEBUG
+       fprintf(stderr, "remove_initpid: removing entry for %d\n", e->initpid);
+#endif
+       /* walk the links rather than the nodes so the head needs no special case */
+       for (link = &pidns_hash_table[HASH(e->ino)]; *link; link = &(*link)->next) {
+               if (*link != e)
+                       continue;
+               *link = e->next;
+               free(e);
+               return;
+       }
+}
+
+#define PURGE_SECS 5
+/* Must be called under store_lock */
+static void prune_initpid_store(void)
+{
+       static long int last_prune = 0;
+       struct pidns_init_store *e, *prev, *delme;
+       long int now, threshold;
+       int i;
+
+       if (!last_prune) {
+               last_prune = time(NULL);
+               return;
+       }
+       now = time(NULL);
+       if (now < last_prune + PURGE_SECS)
+               return;
+#if DEBUG
+       fprintf(stderr, "pruning\n");
+#endif
+       last_prune = now;
+       threshold = now - 2 * PURGE_SECS;
+
+       for (i = 0; i < PIDNS_HASH_SIZE; i++) {
+               for (prev = NULL, e = pidns_hash_table[i]; e; ) {
+                       if (e->lastcheck < threshold) {
+#if DEBUG
+                               fprintf(stderr, "Removing cached entry for %d\n", e->initpid);
+#endif
+                               delme = e;
+                               if (prev)
+                                       prev->next = e->next;
+                               else
+                                       pidns_hash_table[i] = e->next;
+                               e = e->next;
+                               free(delme);
+                       } else {
+                               prev = e;
+                               e = e->next;
+                       }
+               }
         }
-       memcpy(d+*sz, tmp, strlen(tmp));
-       *sz += strlen(tmp);
-       d[*sz] = '\0';
+}
+
+/*
+ * Cache @pid as the init pid for the pid namespace whose ns inode is
+ * described by @sb, together with the current st_ctime of /proc/<pid>.
+ * If /proc/<pid> cannot be stat'ed, nothing is stored.
+ * Must be called under store_lock.
+ */
+static void save_initpid(struct stat *sb, pid_t pid)
+{
+       struct pidns_init_store *entry = NULL;
+       struct stat pidsb;
+       char procpath[100];
+       int bucket;
+
+#if DEBUG
+       fprintf(stderr, "save_initpid: adding entry for %d\n", pid);
+#endif
+       snprintf(procpath, 100, "/proc/%d", pid);
+       if (stat(procpath, &pidsb) < 0)
+               return;
+
+       /* allocation is retried until it succeeds */
+       while (!entry)
+               entry = malloc(sizeof(*entry));
+
+       bucket = HASH(sb->st_ino);
+       entry->ino = sb->st_ino;
+       entry->initpid = pid;
+       entry->ctime = pidsb.st_ctime;
+       entry->lastcheck = time(NULL);
+       /* push onto the front of the bucket's chain */
+       entry->next = pidns_hash_table[bucket];
+       pidns_hash_table[bucket] = entry;
+}
+
+/*
+ * Given the stat(2) info for a pid namespace inode, find the store entry
+ * whose inode number matches and verify that its cached init pid is still
+ * valid.  A matching but stale entry is removed.  Returns the entry (with
+ * its lastcheck time refreshed) if valid, NULL otherwise.
+ * Must be called under store_lock
+ */
+static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
+{
+       struct pidns_init_store *cur;
+
+       for (cur = pidns_hash_table[HASH(sb->st_ino)]; cur; cur = cur->next) {
+               if (cur->ino != sb->st_ino)
+                       continue;
+               if (!initpid_still_valid(cur, sb)) {
+                       remove_initpid(cur);
+                       return NULL;
+               }
+               cur->lastcheck = time(NULL);
+               return cur;
+       }
+
+       return NULL;
+}
+
+#define SEND_CREDS_OK 0
+#define SEND_CREDS_NOTSK 1
+#define SEND_CREDS_FAIL 2
+static bool recv_creds(int sock, struct ucred *cred, char *v);
+static int wait_for_pid(pid_t pid);
+static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
+
+/*
+ * fork a task which switches to @target's pid namespace and sends
+ * ucred.pid = 1 (with the byte '1') over the unix socket @sock, so the
+ * receiver can read the pid of that namespace's reaper in its own
+ * namespace.
+ * This function never returns: every path ends in _exit().  The exit
+ * status is 0 only if the forked sender succeeded.
+ */
+static void write_task_init_pid_exit(int sock, pid_t target)
+{
+       struct ucred cred;
+       char fnam[100];
+       pid_t pid;
+       char v;
+       int fd, ret;
+
+       ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
+       if (ret < 0 || (size_t)ret >= sizeof(fnam))
+               _exit(1);
+
+       fd = open(fnam, O_RDONLY);
+       if (fd < 0) {
+               perror("write_task_init_pid_exit open of ns/pid");
+               _exit(1);
+       }
+       if (setns(fd, 0)) {
+               perror("write_task_init_pid_exit setns 1");
+               close(fd);
+               _exit(1);
+       }
+       /* the namespace fd is not needed once setns() has succeeded */
+       close(fd);
+
+       /*
+        * fork again: per setns(2), joining a pid namespace only takes
+        * effect for children created afterwards.
+        */
+       pid = fork();
+       if (pid < 0)
+               _exit(1);
+       if (pid != 0) {
+               /*
+                * wait_for_pid() reports success as 0, so any non-zero
+                * result means the sender failed.
+                */
+               if (wait_for_pid(pid) != 0)
+                       _exit(1);
+               _exit(0);
+       }
+
+       /* we are the child */
+       cred.uid = 0;
+       cred.gid = 0;
+       cred.pid = 1;
+       v = '1';
+       if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
+               _exit(1);
+       _exit(0);
+}
+
+/*
+ * Fork a helper that enters @task's pid namespace and sends credentials
+ * carrying pid 1 back over a socketpair; the pid received by recv_creds()
+ * is returned.  Returns -1 if the socketpair, the fork or the receive
+ * fails.  The helper is always reaped before returning.
+ */
+static pid_t get_init_pid_for_task(pid_t task)
+{
+       struct ucred cred;
+       int sock[2];
+       char v = '0';
+       pid_t helper;
+       pid_t initpid = -1;
+
+       if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
+               perror("socketpair");
+               return -1;
+       }
+
+       helper = fork();
+       if (helper == 0) {
+               close(sock[1]);
+               write_task_init_pid_exit(sock[0], task);
+               _exit(0);
+       }
+
+       /* only the parent of a successful fork has anything to receive */
+       if (helper > 0 && recv_creds(sock[1], &cred, &v))
+               initpid = cred.pid;
+
+       close(sock[0]);
+       close(sock[1]);
+       if (helper > 0)
+               wait_for_pid(helper);
+       return initpid;
+}
+
+/*
+ * Return the pid of init for @qpid's pid namespace.  A valid cached entry
+ * is used when there is one; otherwise the pid is looked up with
+ * get_init_pid_for_task() and cached if the lookup yielded a positive pid.
+ * Returns 0 if /proc/<qpid>/ns/pid cannot be stat'ed, or the (non-positive)
+ * result of a failed lookup.  Takes and releases the store lock itself.
+ */
+static pid_t lookup_initpid_in_store(pid_t qpid)
+{
+       char nspath[100];
+       struct stat nssb;
+       struct pidns_init_store *entry;
+       pid_t initpid = 0;
+
+       snprintf(nspath, 100, "/proc/%d/ns/pid", qpid);
+       store_lock();
+       if (stat(nspath, &nssb) == 0) {
+               entry = lookup_verify_initpid(&nssb);
+               if (entry) {
+                       initpid = entry->initpid;
+               } else {
+                       initpid = get_init_pid_for_task(qpid);
+                       if (initpid > 0)
+                               save_initpid(&nssb, initpid);
+               }
+       }
+
+       /* we prune at the end in case we would be pruning
+        * the entry whose value we are about to return */
+       prune_initpid_store();
+       store_unlock();
+       return initpid;
  }
  
  static int wait_for_pid(pid_t pid)
  {
         int status, ret;
  
+       if (pid <= 0)
+               return -1;
+
  again:
         ret = waitpid(pid, &status, 0);
         if (ret == -1) {
@@ -115,6 +394,33 @@ again:
         return 0;
  }
  
+
+/*
+ * Append "<pid>\n" to the string in *src, growing the buffer when needed.
+ * src: a pointer to a char* to which to append the pid; *src may be NULL,
+ *      in which case a buffer is allocated.
+ * sz: the number of characters printed so far, minus trailing \0.
+ * asz: the allocated size so far
+ * pid: the pid to append
+ * Allocation is retried until it succeeds.
+ */
+static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
+{
+       char pidstr[30];
+       int pidlen = sprintf(pidstr, "%d\n", (int)pid);
+
+       if (*src == NULL || pidlen + *sz + 1 >= *asz) {
+               char *grown = NULL;
+
+               while (!grown)
+                       grown = realloc(*src, *asz + BUF_RESERVE_SIZE);
+               *src = grown;
+               *asz += BUF_RESERVE_SIZE;
+       }
+       memcpy(*src + *sz, pidstr, pidlen);
+       *sz += pidlen;
+       (*src)[*sz] = '\0';
+}
+
  /*
   * Given a open file * to /proc/pid/{u,g}id_map, and an id
   * valid in the caller's namespace, return the id mapped into
@@ -235,6 +541,12 @@ static bool perms_include(int fmode, mode_t req_mode)
         return ((fmode & r) == r);
  }
  
+
+/*
+ * taskcg is  a/b/c
+ * querycg is /a/b/c/d/e
+ * we return 'd'
+ */
  static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
  {
         char *start, *end;
@@ -325,12 +637,6 @@ static bool fc_may_access(struct fuse_context *fc, const char *contrl, const cha
         struct cgfs_files *k = NULL;
         bool ret = false;
  
-       if (!file)
-               file = "tasks";
-
-       if (*file == '/')
-               file++;
-
         k = cgfs_get_key(contrl, cg, file);
         if (!k)
                 return false;
@@ -358,9 +664,12 @@ out:
  static void prune_init_slice(char *cg)
  {
         char *point;
-       point = cg + strlen(cg) - strlen(INITSCOPE);
-       if (point < cg)
-                return;
+       size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
+
+       if (cg_len < initscope_len)
+               return;
+
+       point = cg + cg_len - initscope_len;
         if (strcmp(point, INITSCOPE) == 0) {
                 if (point == cg)
                         *(point+1) = '\0';
@@ -370,61 +679,88 @@ static void prune_init_slice(char *cg)
  }
  
  /*
- * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
- * If caller is in /a, he may act on /a/b, but not on /b.
+ * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
+ * If pid is in /a, he may act on /a/b, but not on /b.
   * if the answer is false and nextcg is not NULL, then *nextcg will point
   * to a string containing the next cgroup directory under cg, which must be
   * freed by the caller.
   */
  static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
  {
-       char fnam[PROCLEN];
-       FILE *f;
         bool answer = false;
-       char *line = NULL;
-       size_t len = 0;
-       int ret;
+       char *c2 = get_pid_cgroup(pid, contrl);
+       char *linecmp;
  
-       ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
-       if (ret < 0 || ret >= PROCLEN)
-               return false;
-       if (!(f = fopen(fnam, "r")))
+       if (!c2)
                 return false;
+       prune_init_slice(c2);
  
-       while (getline(&line, &len, f) != -1) {
-               char *c1, *c2, *linecmp;
-               if (!line[0])
-                       continue;
-               c1 = strchr(line, ':');
-               if (!c1)
-                       goto out;
-               c1++;
-               c2 = strchr(c1, ':');
-               if (!c2)
-                       goto out;
-               *c2 = '\0';
-               if (strcmp(c1, contrl) != 0)
-                       continue;
-               c2++;
-               stripnewline(c2);
-               prune_init_slice(c2);
-               /*
-                * callers pass in '/' for root cgroup, otherwise they pass
-                * in a cgroup without leading '/'
-                */
-               linecmp = *cg == '/' ? c2 : c2+1;
-               if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
-                       if (nextcg)
-                               *nextcg = get_next_cgroup_dir(linecmp, cg);
-                       goto out;
+       /*
+        * callers pass in '/' for root cgroup, otherwise they pass
+        * in a cgroup without leading '/'
+        */
+       linecmp = *cg == '/' ? c2 : c2+1;
+       if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
+               if (nextcg) {
+                       *nextcg = get_next_cgroup_dir(linecmp, cg);
                 }
+               goto out;
+       }
+       answer = true;
+
+out:
+       free(c2);
+       return answer;
+}
+
+/*
+ * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
+ */
+static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
+{
+       bool answer = false;
+       char *c2, *task_cg;
+       size_t target_len, task_len;
+
+       if (strcmp(cg, "/") == 0)
+               return true;
+
+       c2 = get_pid_cgroup(pid, contrl);
+       if (!c2)
+               return false;
+       prune_init_slice(c2);
+
+       task_cg = c2 + 1;
+       target_len = strlen(cg);
+       task_len = strlen(task_cg);
+       if (task_len == 0) {
+               /* Task is in the root cg, it can see everything. This case is
+                * not handled by the strcmps below, since they test for the
+                * last /, but that is the first / that we've chopped off
+                * above.
+                */
                 answer = true;
                 goto out;
         }
+       if (strcmp(cg, task_cg) == 0) {
+               answer = true;
+               goto out;
+       }
+       if (target_len < task_len) {
+               /* looking up a parent dir */
+               if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
+                       answer = true;
+               goto out;
+       }
+       if (target_len > task_len) {
+               /* looking up a child dir */
+               if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
+                       answer = true;
+               goto out;
+       }
  
  out:
-       fclose(f);
-       free(line);
+       free(c2);
         return answer;
  }
  
@@ -474,18 +810,19 @@ static const char *find_cgroup_in_path(const char *path)
  }
  
  /*
- * dir should be freed, file not
- */
-static void get_cgdir_and_path(const char *cg, char **dir, char **file)
+ * split the last path element from the path in @cg.
+ * @dir is newly allocated and should be freed, @last not
+*/
+static void get_cgdir_and_path(const char *cg, char **dir, char **last)
  {
         char *p;
  
         do {
                 *dir = strdup(cg);
         } while (!*dir);
-       *file = strrchr(cg, '/');
-       if (!*file) {
-               *file = NULL;
+       *last = strrchr(cg, '/');
+       if (!*last) {
+               *last = NULL;
                 return;
         }
         p = strrchr(*dir, '/');
@@ -501,7 +838,7 @@ static int cg_getattr(const char *path, struct stat *sb)
         struct timespec now;
         struct fuse_context *fc = fuse_get_context();
         char * cgdir = NULL;
-       char *fpath = NULL, *path1, *path2;
+       char *last = NULL, *path1, *path2;
         struct cgfs_files *k = NULL;
         const char *cgroup;
         const char *controller = NULL;
@@ -537,22 +874,29 @@ static int cg_getattr(const char *path, struct stat *sb)
                 return 0;
         }
  
-       get_cgdir_and_path(cgroup, &cgdir, &fpath);
+       get_cgdir_and_path(cgroup, &cgdir, &last);
  
-       if (!fpath) {
+       if (!last) {
                 path1 = "/";
                 path2 = cgdir;
         } else {
                 path1 = cgdir;
-               path2 = fpath;
+               path2 = last;
         }
  
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 0)
+               initpid = fc->pid;
         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
-        * Then check that caller's cgroup is under path if fpath is a child
-        * cgroup, or cgdir if fpath is a file */
+        * Then check that caller's cgroup is under path if last is a child
+        * cgroup, or cgdir if last is a file */
  
         if (is_child_cgroup(controller, path1, path2)) {
-               if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
+               if (!caller_may_see_dir(initpid, controller, cgroup)) {
+                       ret = -ENOENT;
+                       goto out;
+               }
+               if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
                         /* this is just /cgroup/controller, return it as a dir */
                         sb->st_mode = S_IFDIR | 00555;
                         sb->st_nlink = 2;
@@ -567,7 +911,7 @@ static int cg_getattr(const char *path, struct stat *sb)
                 // get uid, gid, from '/tasks' file and make up a mode
                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
                 sb->st_mode = S_IFDIR | 00755;
-               k = cgfs_get_key(controller, cgroup, "tasks");
+               k = cgfs_get_key(controller, cgroup, NULL);
                 if (!k) {
                         sb->st_uid = sb->st_gid = 0;
                 } else {
@@ -587,7 +931,7 @@ static int cg_getattr(const char *path, struct stat *sb)
                 sb->st_gid = k->gid;
                 sb->st_size = 0;
                 free_key(k);
-               if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL)) {
+               if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
                         ret = -ENOENT;
                         goto out;
                 }
@@ -630,8 +974,14 @@ static int cg_opendir(const char *path, struct fuse_file_info *fi)
                 }
         }
  
-       if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
-               return -EACCES;
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 0)
+               initpid = fc->pid;
+       if (cgroup) {
+               if (!caller_may_see_dir(initpid, controller, cgroup))
+                       return -ENOENT;
+               if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
+                       return -EACCES;
         }
  
         /* we'll free this at cg_releasedir */
@@ -681,7 +1031,10 @@ static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t
                 goto out;
         }
  
-       if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 0)
+               initpid = fc->pid;
+       if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
                 if (nextcg) {
                         int ret;
                         ret = filler(buf, nextcg,  NULL, 0);
@@ -748,7 +1101,7 @@ static int cg_releasedir(const char *path, struct fuse_file_info *fi)
  static int cg_open(const char *path, struct fuse_file_info *fi)
  {
         const char *cgroup;
-       char *fpath = NULL, *path1, *path2, * cgdir = NULL, *controller;
+       char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
         struct cgfs_files *k = NULL;
         struct file_info *file_info;
         struct fuse_context *fc = fuse_get_context();
@@ -764,13 +1117,13 @@ static int cg_open(const char *path, struct fuse_file_info *fi)
         if (!cgroup)
                 return -EINVAL;
  
-       get_cgdir_and_path(cgroup, &cgdir, &fpath);
-       if (!fpath) {
+       get_cgdir_and_path(cgroup, &cgdir, &last);
+       if (!last) {
                 path1 = "/";
                 path2 = cgdir;
         } else {
                 path1 = cgdir;
-               path2 = fpath;
+               path2 = last;
         }
  
         k = cgfs_get_key(controller, path1, path2);
@@ -780,6 +1133,13 @@ static int cg_open(const char *path, struct fuse_file_info *fi)
         }
         free_key(k);
  
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 0)
+               initpid = fc->pid;
+       if (!caller_may_see_dir(initpid, controller, path1)) {
+               ret = -ENOENT;
+               goto out;
+       }
         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
                 // should never get here
                 ret = -EACCES;
@@ -815,24 +1175,61 @@ static int cg_release(const char *path, struct fuse_file_info *fi)
         return 0;
  }
  
+#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
+
+/*
+ * Wait for @sock to become readable (or hung up), for at most about
+ * @timeout seconds.  Returns true if epoll_wait() reported an event, false
+ * on timeout or error.  When the deadline is found to have passed before
+ * epoll_wait() is called, errno is set to 0.
+ */
+static bool wait_for_sock(int sock, int timeout)
+{
+       struct epoll_event ev;
+       int epfd, nready, now, deadline, remaining, saved_errno;
+
+       now = time(NULL);
+       if (now < 0)
+               return false;
+       deadline = now + timeout;
+
+       epfd = epoll_create(1);
+       if (epfd < 0) {
+               fprintf(stderr, "Failed to create epoll socket: %m\n");
+               return false;
+       }
+
+       ev.events = POLLIN_SET;
+       ev.data.fd = sock;
+       if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
+               fprintf(stderr, "Failed adding socket to epoll: %m\n");
+               close(epfd);
+               return false;
+       }
+
+       /* recompute the remaining time whenever epoll_wait() is interrupted */
+       do {
+               now = time(NULL);
+               if (now < 0) {
+                       close(epfd);
+                       return false;
+               }
+
+               remaining = deadline - now;
+               if (remaining < 0) { // timeout
+                       errno = 0;
+                       close(epfd);
+                       return false;
+               }
+               nready = epoll_wait(epfd, &ev, 1, 1000*remaining + 1);
+       } while (nready < 0 && errno == EINTR);
+
+       saved_errno = errno;
+       close(epfd);
+
+       if (nready <= 0) {
+               errno = saved_errno;
+               return false;
+       }
+       return true;
+}
+
  static int msgrecv(int sockfd, void *buf, size_t len)
  {
-       struct timeval tv;
-       fd_set rfds;
-
-       FD_ZERO(&rfds);
-       FD_SET(sockfd, &rfds);
-       tv.tv_sec = 2;
-       tv.tv_usec = 0;
-
-       if (select(sockfd+1, &rfds, NULL, NULL, &tv) <= 0)
+       if (!wait_for_sock(sockfd, 2))
                 return -1;
         return recv(sockfd, buf, len, MSG_DONTWAIT);
  }
  
-#define SEND_CREDS_OK 0
-#define SEND_CREDS_NOTSK 1
-#define SEND_CREDS_FAIL 2
  static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
  {
         struct msghdr msg = { 0 };
@@ -888,8 +1285,6 @@ static bool recv_creds(int sock, struct ucred *cred, char *v)
         char buf[1];
         int ret;
         int optval = 1;
-       struct timeval tv;
-       fd_set rfds;
  
         *v = '1';
  
@@ -917,12 +1312,8 @@ static bool recv_creds(int sock, struct ucred *cred, char *v)
         msg.msg_iov = &iov;
         msg.msg_iovlen = 1;
  
-       FD_ZERO(&rfds);
-       FD_SET(sock, &rfds);
-       tv.tv_sec = 2;
-       tv.tv_usec = 0;
-       if (select(sock+1, &rfds, NULL, NULL, &tv) <= 0) {
-               fprintf(stderr, "Failed to select for scm_cred: %s\n",
+       if (!wait_for_sock(sock, 2)) {
+               fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
                           strerror(errno));
                 return false;
         }
@@ -976,8 +1367,6 @@ static void pid_to_ns_wrapper(int sock, pid_t tpid)
         int newnsfd = -1, ret, cpipe[2];
         char fnam[100];
         pid_t cpid;
-       struct timeval tv;
-       fd_set s;
         char v;
  
         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
@@ -993,7 +1382,6 @@ static void pid_to_ns_wrapper(int sock, pid_t tpid)
         if (pipe(cpipe) < 0)
                 _exit(1);
  
-loop:
         cpid = fork();
         if (cpid < 0)
                 _exit(1);
@@ -1007,29 +1395,19 @@ loop:
                 }
                 close(cpipe[1]);
                 pid_to_ns(sock, tpid);
+               _exit(1); // not reached
         }
         // give the child 1 second to be done forking and
-       // write it's ack
-       FD_ZERO(&s);
-       FD_SET(cpipe[0], &s);
-       tv.tv_sec = 1;
-       tv.tv_usec = 0;
-       ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
-       if (ret <= 0)
-               goto again;
+       // write its ack
+       if (!wait_for_sock(cpipe[0], 1))
+               _exit(1);
         ret = read(cpipe[0], &v, 1);
-       if (ret != sizeof(char) || v != '1') {
-               goto again;
-       }
+       if (ret != sizeof(char) || v != '1')
+               _exit(1);
  
         if (!wait_for_pid(cpid))
                 _exit(1);
         _exit(0);
-
-again:
-       kill(cpid, SIGKILL);
-       wait_for_pid(cpid);
-       goto loop;
  }
  
  /*
@@ -1046,9 +1424,7 @@ static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const c
         bool answer = false;
         char v = '0';
         struct ucred cred;
-       struct timeval tv;
         size_t sz = 0, asz = 0;
-       fd_set s;
  
         if (!cgfs_get_value(contrl, cg, file, &tmpdata))
                 return false;
@@ -1069,7 +1445,7 @@ static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const c
         if (cpid == -1)
                 goto out;
  
-       if (!cpid) // child
+       if (!cpid) // child - exits when done
                 pid_to_ns_wrapper(sock[1], tpid);
  
         char *ptr = tmpdata;
@@ -1085,13 +1461,8 @@ static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const c
                         goto out;
  
                 // read converted results
-               FD_ZERO(&s);
-               FD_SET(sock[0], &s);
-               tv.tv_sec = 2;
-               tv.tv_usec = 0;
-               ret = select(sock[0]+1, &s, NULL, NULL, &tv);
-               if (ret <= 0) {
-                       fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
+               if (!wait_for_sock(sock[0], 2)) {
+                       fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
                                 __func__, strerror(errno));
                         goto out;
                 }
@@ -1202,21 +1573,13 @@ static void pid_from_ns(int sock, pid_t tpid)
         pid_t vpid;
         struct ucred cred;
         char v;
-       struct timeval tv;
-       fd_set s;
         int ret;
  
         cred.uid = 0;
         cred.gid = 0;
         while (1) {
-               FD_ZERO(&s);
-               FD_SET(sock, &s);
-               tv.tv_sec = 2;
-               tv.tv_usec = 0;
-               ret = select(sock+1, &s, NULL, NULL, &tv);
-               if (ret <= 0) {
-                       fprintf(stderr, "%s: bad select before read from parent: %s\n",
-                               __func__, strerror(errno));
+               if (!wait_for_sock(sock, 2)) {
+                       fprintf(stderr, "%s: timeout reading from parent\n", __func__);
                         _exit(1);
                 }
                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
@@ -1243,8 +1606,6 @@ static void pid_from_ns_wrapper(int sock, pid_t tpid)
         int newnsfd = -1, ret, cpipe[2];
         char fnam[100];
         pid_t cpid;
-       fd_set s;
-       struct timeval tv;
         char v;
  
         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
@@ -1278,13 +1639,8 @@ loop:
         }
  
         // give the child 1 second to be done forking and
-       // write it's ack
-       FD_ZERO(&s);
-       FD_SET(cpipe[0], &s);
-       tv.tv_sec = 1;
-       tv.tv_usec = 0;
-       ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
-       if (ret <= 0)
+       // write its ack
+       if (!wait_for_sock(cpipe[0], 1))
                 goto again;
         ret = read(cpipe[0], &v, 1);
         if (ret != sizeof(char) || v != '1') {
@@ -1301,7 +1657,95 @@ again:
         goto loop;
  }
  
-static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
+/*
+ * Given host @uid, store in @answer the uid to which it maps in
+ * @pid's user namespace; return false if it cannot be mapped.
+ */
+bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
+{
+       FILE *f;
+       char line[400];
+
+       sprintf(line, "/proc/%d/uid_map", pid);
+       if ((f = fopen(line, "r")) == NULL) {
+               return false;
+       }
+
+       *answer = convert_id_to_ns(f, uid);
+       fclose(f);
+
+       if (*answer == -1)
+               return false;
+       return true;
+}
+
+/*
+ * get_pid_creds: get the real uid and gid of @pid from
+ * /proc/$pid/status; each is left as -1 if it cannot be read
+ * (XXX should we use euid here?)
+ */
+void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
+{
+       char line[400];
+       uid_t u;
+       gid_t g;
+       FILE *f;
+
+       *uid = -1;
+       *gid = -1;
+       sprintf(line, "/proc/%d/status", pid);
+       if ((f = fopen(line, "r")) == NULL) {
+               fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno));
+               return;
+       }
+       while (fgets(line, 400, f)) {
+               if (strncmp(line, "Uid:", 4) == 0) {
+                       if (sscanf(line+4, "%u", &u) != 1) {
+                               fprintf(stderr, "bad uid line for pid %u\n", pid);
+                               fclose(f);
+                               return;
+                       }
+                       *uid = u;
+               } else if (strncmp(line, "Gid:", 4) == 0) {
+                       if (sscanf(line+4, "%u", &g) != 1) {
+                               fprintf(stderr, "bad gid line for pid %u\n", pid);
+                               fclose(f);
+                               return;
+                       }
+                       *gid = g;
+               }
+       }
+       fclose(f);
+}
+
+/*
+ * May the requestor @r move victim @v to a new cgroup?
+ * This is allowed if
+ *   . they are the same task
+ *   . they are owned by the same uid
+ *   . @r is root on the host, or
+ *   . @v's uid is mapped into @r's where @r is root.
+ */
+bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
+{
+       uid_t v_uid, tmpuid;
+       gid_t v_gid;
+
+       if (r == v)
+               return true;
+       if (r_uid == 0)
+               return true;
+       get_pid_creds(v, &v_uid, &v_gid);
+       if (r_uid == v_uid)
+               return true;
+       if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
+                       && hostuid_to_ns(v_uid, r, &tmpuid))
+               return true;
+       return false;
+}
+
+static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
+               const char *file, const char *buf)
  {
         int sock[2] = {-1, -1};
         pid_t qpid, cpid = -1;
@@ -1343,6 +1787,10 @@ static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const
  
                 if (recv_creds(sock[0], &cred, &v)) {
                         if (v == '0') {
+                               if (!may_move_pid(tpid, tuid, cred.pid)) {
+                                       fail = true;
+                                       break;
+                               }
                                 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
                                         fail = true;
                         }
@@ -1415,7 +1863,7 @@ int cg_write(const char *path, const char *buf, size_t size, off_t offset,
                         strcmp(f->file, "/cgroup.procs") == 0 ||
                         strcmp(f->file, "cgroup.procs") == 0)
                 // special case - we have to translate the pids
-               r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
+               r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
         else
                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
  
@@ -1430,7 +1878,7 @@ out:
  int cg_chown(const char *path, uid_t uid, gid_t gid)
  {
         struct fuse_context *fc = fuse_get_context();
-       char *cgdir = NULL, *fpath = NULL, *path1, *path2, *controller;
+       char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
         struct cgfs_files *k = NULL;
         const char *cgroup;
         int ret;
@@ -1449,14 +1897,14 @@ int cg_chown(const char *path, uid_t uid, gid_t gid)
                 /* this is just /cgroup/controller */
                 return -EINVAL;
  
-       get_cgdir_and_path(cgroup, &cgdir, &fpath);
+       get_cgdir_and_path(cgroup, &cgdir, &last);
  
-       if (!fpath) {
+       if (!last) {
                 path1 = "/";
                 path2 = cgdir;
         } else {
                 path1 = cgdir;
-               path2 = fpath;
+               path2 = last;
         }
  
         if (is_child_cgroup(controller, path1, path2)) {
@@ -1495,7 +1943,7 @@ out:
  int cg_chmod(const char *path, mode_t mode)
  {
         struct fuse_context *fc = fuse_get_context();
-       char * cgdir = NULL, *fpath = NULL, *path1, *path2, *controller;
+       char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
         struct cgfs_files *k = NULL;
         const char *cgroup;
         int ret;
@@ -1514,14 +1962,14 @@ int cg_chmod(const char *path, mode_t mode)
                 /* this is just /cgroup/controller */
                 return -EINVAL;
  
-       get_cgdir_and_path(cgroup, &cgdir, &fpath);
+       get_cgdir_and_path(cgroup, &cgdir, &last);
  
-       if (!fpath) {
+       if (!last) {
                 path1 = "/";
                 path2 = cgdir;
         } else {
                 path1 = cgdir;
-               path2 = fpath;
+               path2 = last;
         }
  
         if (is_child_cgroup(controller, path1, path2)) {
@@ -1563,7 +2011,7 @@ out:
  int cg_mkdir(const char *path, mode_t mode)
  {
         struct fuse_context *fc = fuse_get_context();
-       char *fpath = NULL, *path1, *cgdir = NULL, *controller;
+       char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
         const char *cgroup;
         int ret;
  
@@ -1579,33 +2027,46 @@ int cg_mkdir(const char *path, mode_t mode)
         if (!cgroup)
                 return -EINVAL;
  
-       get_cgdir_and_path(cgroup, &cgdir, &fpath);
-       if (!fpath)
+       get_cgdir_and_path(cgroup, &cgdir, &last);
+       if (!last)
                 path1 = "/";
         else
                 path1 = cgdir;
  
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 0)
+               initpid = fc->pid;
+       if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
+               if (!next)
+                       ret = -EINVAL;
+               else if (last && strcmp(next, last) == 0)
+                       ret = -EEXIST;
+               else
+                       ret = -ENOENT;
+               goto out;
+       }
+
         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
                 ret = -EACCES;
                 goto out;
         }
-       if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL)) {
+       if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
                 ret = -EACCES;
                 goto out;
         }
  
         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
-       printf("cgfs_create returned %d for %s %s\n", ret, controller, cgroup);
  
  out:
         free(cgdir);
+       free(next);
         return ret;
  }
  
  static int cg_rmdir(const char *path)
  {
         struct fuse_context *fc = fuse_get_context();
-       char *fpath = NULL, *cgdir = NULL, *controller;
+       char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
         const char *cgroup;
         int ret;
  
@@ -1620,19 +2081,28 @@ static int cg_rmdir(const char *path)
         if (!cgroup)
                 return -EINVAL;
  
-       get_cgdir_and_path(cgroup, &cgdir, &fpath);
-       if (!fpath) {
+       get_cgdir_and_path(cgroup, &cgdir, &last);
+       if (!last) {
                 ret = -EINVAL;
                 goto out;
         }
  
-       fprintf(stderr, "rmdir: verifying access to %s:%s (req path %s)\n",
-                       controller, cgdir, path);
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 0)
+               initpid = fc->pid;
+       if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
+               if (!last || strcmp(next, last) == 0)
+                       ret = -EBUSY;
+               else
+                       ret = -ENOENT;
+               goto out;
+       }
+
         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
                 ret = -EACCES;
                 goto out;
         }
-       if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
+       if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
                 ret = -EACCES;
                 goto out;
         }
@@ -1646,6 +2116,7 @@ static int cg_rmdir(const char *path)
  
  out:
         free(cgdir);
+       free(next);
         return ret;
  }
  
@@ -1720,16 +2191,9 @@ static int read_file(const char *path, char *buf, size_t size,
                         rv = 0;
                         goto err;
                 }
-               if (l < cache_size) {
-                       cache += l;
-                       cache_size -= l;
-                       total_len += l;
-               } else {
-                       cache += cache_size;
-                       total_len += cache_size;
-                       cache_size = 0;
-                       break;
-               }
+               cache += l;
+               cache_size -= l;
+               total_len += l;
         }
  
         d->size = total_len;
@@ -1785,7 +2249,8 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset,
         struct file_info *d = (struct file_info *)fi->fh;
         char *cg;
         char *memusage_str = NULL, *memstat_str = NULL,
-               *memswlimit_str = NULL, *memswusage_str = NULL;
+               *memswlimit_str = NULL, *memswusage_str = NULL,
+               *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
         unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
                 cached = 0, hosttotal = 0;
         char *line = NULL;
@@ -1805,7 +2270,10 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset,
                 return total_len;
         }
  
-       cg = get_pid_cgroup(fc->pid, "memory");
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 0)
+               initpid = fc->pid;
+       cg = get_pid_cgroup(initpid, "memory");
         if (!cg)
                 return read_file("/proc/meminfo", buf, size, d);
  
@@ -1820,15 +2288,28 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset,
         if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
                 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
         {
+               /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
+               if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
+                       goto err;
+               if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
+                       goto err;
+
                 memswlimit = strtoul(memswlimit_str, NULL, 10);
                 memswusage = strtoul(memswusage_str, NULL, 10);
-               memswlimit /= 1024;
-               memswusage /= 1024;
+
+               if (!strcmp(memswlimit_str, memswlimit_default_str))
+                       memswlimit = 0;
+               if (!strcmp(memswusage_str, memswusage_default_str))
+                       memswusage = 0;
+
+               memswlimit = memswlimit / 1024;
+               memswusage = memswusage / 1024;
         }
-       
+
         memusage = strtoul(memusage_str, NULL, 10);
         memlimit /= 1024;
         memusage /= 1024;
+
         get_mem_cached(memstat_str, &cached);
  
         f = fopen("/proc/meminfo", "r");
@@ -1904,6 +2385,8 @@ err:
         free(memswlimit_str);
         free(memswusage_str);
         free(memstat_str);
+       free(memswlimit_default_str);
+       free(memswusage_default_str);
         return rv;
  }
  
@@ -1969,7 +2452,10 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
                 return total_len;
         }
  
-       cg = get_pid_cgroup(fc->pid, "cpuset");
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 0)
+               initpid = fc->pid;
+       cg = get_pid_cgroup(initpid, "cpuset");
         if (!cg)
                 return read_file("proc/cpuinfo", buf, size, d);
  
@@ -1998,16 +2484,9 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
                                         rv = 0;
                                         goto err;
                                 }
-                               if (l < cache_size){
-                                       cache += l;
-                                       cache_size -= l;
-                                       total_len += l;
-                               }else{
-                                       cache += cache_size;
-                                       total_len += cache_size;
-                                       cache_size = 0;
-                                       break;
-                               }
+                               cache += l;
+                               cache_size -= l;
+                               total_len += l;
                         }
                         continue;
                 }
@@ -2023,16 +2502,9 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
                                 rv = 0;
                                 goto err;
                         }
-                       if (l < cache_size) {
-                               cache += l;
-                               cache_size -= l;
-                               total_len += l;
-                       } else {
-                               cache += cache_size;
-                               total_len += cache_size;
-                               cache_size = 0;
-                               break;
-                       }
+                       cache += l;
+                       cache_size -= l;
+                       total_len += l;
                 }
         }
  
@@ -2083,7 +2555,10 @@ static int proc_stat_read(char *buf, size_t size, off_t offset,
                 return total_len;
         }
  
-       cg = get_pid_cgroup(fc->pid, "cpuset");
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 0)
+               initpid = fc->pid;
+       cg = get_pid_cgroup(initpid, "cpuset");
         if (!cg)
                 return read_file("/proc/stat", buf, size, d);
  
@@ -2120,18 +2595,10 @@ static int proc_stat_read(char *buf, size_t size, off_t offset,
                                 rv = 0;
                                 goto err;
                         }
-                       if (l < cache_size) {
-                               cache += l;
-                               cache_size -= l;
-                               total_len += l;
-                               continue;
-                       } else {
-                               //no more space, break it
-                               cache += cache_size;
-                               total_len += cache_size;
-                               cache_size = 0;
-                               break;
-                       }
+                       cache += l;
+                       cache_size -= l;
+                       total_len += l;
+                       continue;
                 }
  
                 if (sscanf(cpu_char, "%d", &cpu) != 1)
@@ -2205,247 +2672,43 @@ err:
         return rv;
  }
  
-/*
- * How to guess what to present for uptime?
- * One thing we could do would be to take the date on the caller's
- * memory.usage_in_bytes file, which should equal the time of creation
- * of his cgroup.  However, a task could be in a sub-cgroup of the
- * container.  The same problem exists if we try to look at the ages
- * of processes in the caller's cgroup.
- *
- * So we'll fork a task that will enter the caller's pidns, mount a
- * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
- *
- * For the second uptime #, we'll do as Stéphane had done, just copy
- * the number from /proc/uptime.  Not sure how to best emulate 'idle'
- * time.  Maybe someone can come up with a good algorithm and submit a
- * patch.  Maybe something based on cpushare info?
- */
-
-/* return age of the reaper for $pid, taken from ctime of its procdir */
-static long int get_pid1_time(pid_t pid)
+static long int getreaperage(pid_t pid)
  {
         char fnam[100];
-       int fd, cpipe[2], ret;
         struct stat sb;
-       pid_t cpid;
-       struct timeval tv;
-       fd_set s;
-       char v;
-
-       if (unshare(CLONE_NEWNS))
-               return 0;
-
-       if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
-               perror("rslave mount failed");
-               return 0;
-       }
-
-       ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", pid);
-       if (ret < 0 || ret >= sizeof(fnam))
-               return 0;
+       int ret;
+       pid_t qpid;
  
-       fd = open(fnam, O_RDONLY);
-       if (fd < 0) {
-               perror("get_pid1_time open of ns/pid");
+       qpid = lookup_initpid_in_store(pid);
+       if (qpid <= 0)
                 return 0;
-       }
-       if (setns(fd, 0)) {
-               perror("get_pid1_time setns 1");
-               close(fd);
-               return 0;
-       }
-       close(fd);
-
-       if (pipe(cpipe) < 0)
-               exit(1);
  
-loop:
-       cpid = fork();
-       if (cpid < 0)
+       ret = snprintf(fnam, 100, "/proc/%d", qpid);
+       if (ret < 0 || ret >= 100)
                 return 0;
  
-       if (!cpid) {
-               char b = '1';
-               close(cpipe[0]);
-               if (write(cpipe[1], &b, sizeof(char)) < 0) {
-                       fprintf(stderr, "%s (child): erorr on write: %s\n",
-                               __func__, strerror(errno));
-               }
-               close(cpipe[1]);
-               umount2("/proc", MNT_DETACH);
-               if (mount("proc", "/proc", "proc", 0, NULL)) {
-                       perror("get_pid1_time mount");
-                       return 0;
-               }
-               ret = lstat("/proc/1", &sb);
-               if (ret) {
-                       perror("get_pid1_time lstat");
-                       return 0;
-               }
-               return time(NULL) - sb.st_ctime;
-       }
-
-       // give the child 1 second to be done forking and
-       // write it's ack
-       FD_ZERO(&s);
-       FD_SET(cpipe[0], &s);
-       tv.tv_sec = 1;
-       tv.tv_usec = 0;
-       ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
-       if (ret <= 0)
-               goto again;
-       ret = read(cpipe[0], &v, 1);
-       if (ret != sizeof(char) || v != '1') {
-               goto again;
-       }
-
-       wait_for_pid(cpid);
-       _exit(0);
-
-again:
-       kill(cpid, SIGKILL);
-       wait_for_pid(cpid);
-       goto loop;
-}
-
-static long int getreaperage(pid_t qpid)
-{
-       int pid, mypipe[2], ret;
-       struct timeval tv;
-       fd_set s;
-       long int mtime, answer = 0;
-
-       if (pipe(mypipe)) {
+       if (lstat(fnam, &sb) < 0)
                 return 0;
-       }
-
-       pid = fork();
-
-       if (!pid) { // child
-               mtime = get_pid1_time(qpid);
-               if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
-                       fprintf(stderr, "Warning: bad write from getreaperage\n");
-               _exit(0);
-       }
-
-       close(mypipe[1]);
-       FD_ZERO(&s);
-       FD_SET(mypipe[0], &s);
-       tv.tv_sec = 1;
-       tv.tv_usec = 0;
-       ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
-       if (ret <= 0) {
-               perror("select");
-               goto out;
-       }
-       if (!ret) {
-               fprintf(stderr, "timed out\n");
-               goto out;
-       }
-       if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
-               perror("read");
-               goto out;
-       }
-       answer = mtime;
-
-out:
-       wait_for_pid(pid);
-       close(mypipe[0]);
-       return answer;
-}
-
-/*
- * fork a task which switches to @task's namespace and writes '1'.
- * over a unix sock so we can read the task's reaper's pid in our
- * namespace
- */
-void write_task_init_pid_exit(int sock, pid_t target)
-{
-       struct ucred cred;
-       char fnam[100];
-       pid_t pid;
-       char v;
-       int fd, ret;
-
-       ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
-       if (ret < 0 || ret >= sizeof(fnam))
-               exit(1);
-
-       fd = open(fnam, O_RDONLY);
-       if (fd < 0) {
-               perror("get_pid1_time open of ns/pid");
-               exit(1);
-       }
-       if (setns(fd, 0)) {
-               perror("get_pid1_time setns 1");
-               close(fd);
-               exit(1);
-       }
-       pid = fork();
-       if (pid < 0)
-               exit(1);
-       if (pid != 0) {
-               wait_for_pid(pid);
-               exit(0);
-       }
-
-       /* we are the child */
-       cred.uid = 0;
-       cred.gid = 0;
-       cred.pid = 1;
-       v = '1';
-       send_creds(sock, &cred, v, true);
-       exit(0);
-}
-
-static pid_t get_task_reaper_pid(pid_t task)
-{
-       int sock[2];
-       pid_t pid;
-       pid_t ret = -1;
-       char v = '0';
-       struct ucred cred;
-
-       if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
-               perror("socketpair");
-               return -1;
-       }
  
-       pid = fork();
-       if (pid < 0)
-               goto out;
-       if (!pid) {
-               close(sock[1]);
-               write_task_init_pid_exit(sock[0], task);
-       }
-
-       if (!recv_creds(sock[1], &cred, &v))
-               goto out;
-       ret = cred.pid;
-
-out:
-       close(sock[0]);
-       close(sock[1]);
-       return ret;
+       return time(NULL) - sb.st_ctime;
  }
  
  static unsigned long get_reaper_busy(pid_t task)
  {
-       pid_t init = get_task_reaper_pid(task);
+       pid_t initpid = lookup_initpid_in_store(task);
         char *cgroup = NULL, *usage_str = NULL;
         unsigned long usage = 0;
  
-       if (init == -1)
+       if (initpid <= 0)
                 return 0;
  
-       cgroup = get_pid_cgroup(task, "cpuacct");
+       cgroup = get_pid_cgroup(initpid, "cpuacct");
         if (!cgroup)
                 goto out;
         if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
                 goto out;
         usage = strtoul(usage_str, NULL, 10);
-       usage /= 100000000;
+       usage /= 1000000000;
  
  out:
         free(cgroup);
@@ -2463,7 +2726,7 @@ static int proc_uptime_read(char *buf, size_t size, off_t offset,
  {
         struct fuse_context *fc = fuse_get_context();
         struct file_info *d = (struct file_info *)fi->fh;
-       long int reaperage = getreaperage(fc->pid);;
+       long int reaperage = getreaperage(fc->pid);
         unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
         char *cache = d->buf;
         size_t total_len = 0;
@@ -2532,7 +2795,10 @@ static int proc_diskstats_read(char *buf, size_t size, off_t offset,
                 return total_len;
         }
  
-       cg = get_pid_cgroup(fc->pid, "blkio");
+       pid_t initpid = lookup_initpid_in_store(fc->pid);
+       if (initpid <= 0)
+               initpid = fc->pid;
+       cg = get_pid_cgroup(initpid, "blkio");
         if (!cg)
                 return read_file("/proc/diskstats", buf, size, d);