git.proxmox.com Git - mirror_lxcfs.git/blobdiff - lxcfs.c
add cache types for each procfile type that we provide
[mirror_lxcfs.git] / lxcfs.c
diff --git a/lxcfs.c b/lxcfs.c
index 3e4af803be049bd008496857b62d745fe0892439..162f356445176cc3ab238aefaac91b6d1c50fcdd 100644 (file)
--- a/lxcfs.c
+++ b/lxcfs.c
@@ -31,6 +31,7 @@
 
 #include <nih/alloc.h>
 #include <nih/string.h>
+#include <nih/alloc.h>
 
 #include "cgmanager.h"
 
@@ -43,6 +44,36 @@ struct lxcfs_state {
 };
 #define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
 
+enum {
+       LXC_TYPE_CGDIR,
+       LXC_TYPE_CGFILE,
+       LXC_TYPE_PROC_MEMINFO,
+       LXC_TYPE_PROC_CPUINFO,
+       LXC_TYPE_PROC_UPTIME,
+       LXC_TYPE_PROC_STAT,
+       LXC_TYPE_PROC_DISKSTATS,
+};
+
+struct file_info {
+       char *controller;
+       char *cgroup;
+       char *file;
+       int type;
+       char *buf;  // unused as of yet
+       int buflen;
+};
+
+static char *must_copy_string(const char *str)
+{
+       if (!str)
+               return NULL;
+       return NIH_MUST( nih_strdup(NULL, str) );
+}
+
+/*
+ * TODO - return value should denote whether child exited with failure
+ * so callers can return errors.  Esp read/write of tasks and cgroup.procs
+ */
 static int wait_for_pid(pid_t pid)
 {
        int status, ret;
@@ -526,41 +557,59 @@ static int cg_opendir(const char *path, struct fuse_file_info *fi)
        struct fuse_context *fc = fuse_get_context();
        nih_local struct cgm_keys **list = NULL;
        const char *cgroup;
+       struct file_info *dir_info;
        nih_local char *controller = NULL;
-       nih_local char *nextcg = NULL;
 
        if (!fc)
                return -EIO;
 
-       if (strcmp(path, "/cgroup") == 0)
-               return 0;
-
-       // return list of keys for the controller, and list of child cgroups
-       controller = pick_controller_from_path(fc, path);
-       if (!controller)
-               return -EIO;
+       if (strcmp(path, "/cgroup") == 0) {
+               cgroup = NULL;
+               controller = NULL;
+       } else {
+               // return list of keys for the controller, and list of child cgroups
+               controller = pick_controller_from_path(fc, path);
+               if (!controller)
+                       return -EIO;
 
-       cgroup = find_cgroup_in_path(path);
-       if (!cgroup) {
-               /* this is just /cgroup/controller, return its contents */
-               cgroup = "/";
+               cgroup = find_cgroup_in_path(path);
+               if (!cgroup) {
+                       /* this is just /cgroup/controller, return its contents */
+                       cgroup = "/";
+               }
        }
 
        if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
                return -EACCES;
+
+       /* we'll free this at cg_releasedir */
+       dir_info = NIH_MUST( nih_alloc(NULL, sizeof(*dir_info)) );
+       dir_info->controller = must_copy_string(controller);
+       dir_info->cgroup = must_copy_string(cgroup);
+       dir_info->type = LXC_TYPE_CGDIR;
+       dir_info->buf = NULL;
+       dir_info->file = NULL;
+       dir_info->buflen = 0;
+
+       fi->fh = (unsigned long)dir_info;
        return 0;
 }
 
 static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
                struct fuse_file_info *fi)
 {
+       struct file_info *d = (struct file_info *)fi->fh;
+       nih_local struct cgm_keys **list = NULL;
+       int i;
+       nih_local char *nextcg = NULL;
        struct fuse_context *fc = fuse_get_context();
 
-       if (!fc)
+       if (d->type != LXC_TYPE_CGDIR) {
+               fprintf(stderr, "Internal error: file cache info used in readdir\n");
                return -EIO;
-
-       if (strcmp(path, "/cgroup") == 0) {
-               // get list of controllers
+       }
+       if (!d->cgroup && !d->controller) {
+               // ls /var/lib/lxcfs/cgroup - just show list of controllers
                char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
                int i;
 
@@ -575,31 +624,11 @@ static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t
                return 0;
        }
 
-       // return list of keys for the controller, and list of child cgroups
-       nih_local struct cgm_keys **list = NULL;
-       const char *cgroup;
-       nih_local char *controller = NULL;
-       int i;
-       nih_local char *nextcg = NULL;
-
-       controller = pick_controller_from_path(fc, path);
-       if (!controller)
-               return -EIO;
-
-       cgroup = find_cgroup_in_path(path);
-       if (!cgroup) {
-               /* this is just /cgroup/controller, return its contents */
-               cgroup = "/";
-       }
-
-       if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
-               return -EACCES;
-
-       if (!cgm_list_keys(controller, cgroup, &list))
+       if (!cgm_list_keys(d->controller, d->cgroup, &list))
                // not a valid cgroup
                return -EINVAL;
 
-       if (!caller_is_in_ancestor(fc->pid, controller, cgroup, &nextcg)) {
+       if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
                if (nextcg) {
                        int ret;
                        ret = filler(buf, nextcg,  NULL, 0);
@@ -616,9 +645,9 @@ static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t
        }
 
        // now get the list of child cgroups
-       nih_local char **clist;
+       nih_local char **clist = NULL;
 
-       if (!cgm_list_children(controller, cgroup, &clist))
+       if (!cgm_list_children(d->controller, d->cgroup, &clist))
                return 0;
        for (i = 0; clist[i]; i++) {
                if (filler(buf, clist[i], NULL, 0) != 0) {
@@ -628,14 +657,26 @@ static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t
        return 0;
 }
 
+static void do_release_file_info(struct file_info *f)
+{
+       if (f->controller)
+               nih_free(f->controller);
+       if (f->cgroup)
+               nih_free(f->cgroup);
+       if (f->file)
+               nih_free(f->file);
+       free(f->buf);
+       nih_free(f);
+}
+
 static int cg_releasedir(const char *path, struct fuse_file_info *fi)
 {
+       struct file_info *d = (struct file_info *)fi->fh;
+
+       do_release_file_info(d);
        return 0;
 }
 
-/*
- * TODO - cache info here for read/write, release in cg_release.
- */
 static int cg_open(const char *path, struct fuse_file_info *fi)
 {
        nih_local char *controller = NULL;
@@ -643,6 +684,7 @@ static int cg_open(const char *path, struct fuse_file_info *fi)
        char *fpath = NULL, *path1, *path2;
        nih_local char * cgdir = NULL;
        nih_local struct cgm_keys *k = NULL;
+       struct file_info *file_info;
        struct fuse_context *fc = fuse_get_context();
 
        if (!fc)
@@ -664,15 +706,33 @@ static int cg_open(const char *path, struct fuse_file_info *fi)
                path2 = fpath;
        }
 
-       if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
-               if (!fc_may_access(fc, controller, path1, path2, fi->flags))
-                       // should never get here
-                       return -EACCES;
+       k = get_cgroup_key(controller, path1, path2);
+       if (!k)
+               return -EINVAL;
 
-               return 0;
-       }
+       if (!fc_may_access(fc, controller, path1, path2, fi->flags))
+               // should never get here
+               return -EACCES;
 
-       return -EINVAL;
+       /* we'll free this at cg_release */
+       file_info = NIH_MUST( nih_alloc(NULL, sizeof(*file_info)) );
+       file_info->controller = must_copy_string(controller);
+       file_info->cgroup = must_copy_string(path1);
+       file_info->file = must_copy_string(path2);
+       file_info->type = LXC_TYPE_CGFILE;
+       file_info->buf = NULL;
+       file_info->buflen = 0;
+
+       fi->fh = (unsigned long)file_info;
+       return 0;
+}
+
+static int cg_release(const char *path, struct fuse_file_info *fi)
+{
+       struct file_info *f = (struct file_info *)fi->fh;
+
+       do_release_file_info(f);
+       return 0;
 }
 
 static int msgrecv(int sockfd, void *buf, size_t len)
@@ -685,12 +745,15 @@ static int msgrecv(int sockfd, void *buf, size_t len)
        tv.tv_sec = 2;
        tv.tv_usec = 0;
 
-       if (select(sockfd+1, &rfds, NULL, NULL, &tv) < 0)
+       if (select(sockfd+1, &rfds, NULL, NULL, &tv) <= 0)
                return -1;
        return recv(sockfd, buf, len, MSG_DONTWAIT);
 }
 
-static bool send_creds(int sock, struct ucred *cred, char v)
+#define SEND_CREDS_OK 0
+#define SEND_CREDS_NOTSK 1
+#define SEND_CREDS_FAIL 2
+static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
 {
        struct msghdr msg = { 0 };
        struct iovec iov;
@@ -699,10 +762,12 @@ static bool send_creds(int sock, struct ucred *cred, char v)
        char buf[1];
        buf[0] = 'p';
 
-       if (msgrecv(sock, buf, 1) != 1) {
-               printf("%s: Error getting reply from server over socketpair",
-                         __func__);
-               return false;
+       if (pingfirst) {
+               if (msgrecv(sock, buf, 1) != 1) {
+                       fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
+                                 __func__);
+                       return SEND_CREDS_FAIL;
+               }
        }
 
        msg.msg_control = cmsgbuf;
@@ -724,14 +789,14 @@ static bool send_creds(int sock, struct ucred *cred, char v)
        msg.msg_iovlen = 1;
 
        if (sendmsg(sock, &msg, 0) < 0) {
-               printf("%s: failed at sendmsg: %s", __func__,
+               fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
                          strerror(errno));
                if (errno == 3)
-                       return true;
-               return false;
+                       return SEND_CREDS_NOTSK;
+               return SEND_CREDS_FAIL;
        }
 
-       return true;
+       return SEND_CREDS_OK;
 }
 
 static bool recv_creds(int sock, struct ucred *cred, char *v)
@@ -743,6 +808,8 @@ static bool recv_creds(int sock, struct ucred *cred, char *v)
        char buf[1];
        int ret;
        int optval = 1;
+       struct timeval tv;
+       fd_set rfds;
 
        *v = '1';
 
@@ -751,12 +818,12 @@ static bool recv_creds(int sock, struct ucred *cred, char *v)
        cred->gid = -1;
 
        if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
-               printf("Failed to set passcred: %s", strerror(errno));
+               fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
                return false;
        }
        buf[0] = '1';
        if (write(sock, buf, 1) != 1) {
-               printf("Failed to start write on scm fd: %s", strerror(errno));
+               fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
                return false;
        }
 
@@ -770,12 +837,18 @@ static bool recv_creds(int sock, struct ucred *cred, char *v)
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
 
-       // retry logic is not ideal, especially as we are not
-       // threaded.  Sleep at most 1 second waiting for the client
-       // to send us the scm_cred
-       ret = recvmsg(sock, &msg, 0);
+       FD_ZERO(&rfds);
+       FD_SET(sock, &rfds);
+       tv.tv_sec = 2;
+       tv.tv_usec = 0;
+       if (select(sock+1, &rfds, NULL, NULL, &tv) <= 0) {
+               fprintf(stderr, "Failed to select for scm_cred: %s\n",
+                         strerror(errno));
+               return false;
+       }
+       ret = recvmsg(sock, &msg, MSG_DONTWAIT);
        if (ret < 0) {
-               printf("Failed to receive scm_cred: %s",
+               fprintf(stderr, "Failed to receive scm_cred: %s\n",
                          strerror(errno));
                return false;
        }
@@ -794,10 +867,11 @@ static bool recv_creds(int sock, struct ucred *cred, char *v)
 
 
 /*
- * pidreader - reads pids from a ucred over a socket, then writes the
- * int value back over the socket
+ * pid_to_ns - reads pids from a ucred over a socket, then writes the
+ * int value back over the socket.  This shifts the pid from the
+ * sender's pidns into tpid's pidns.
  */
-static void pidreader(int sock, pid_t tpid)
+static void pid_to_ns(int sock, pid_t tpid)
 {
        char v = '0';
        struct ucred cred;
@@ -805,7 +879,6 @@ static void pidreader(int sock, pid_t tpid)
        while (recv_creds(sock, &cred, &v)) {
                if (v == '1')
                        exit(0);
-               printf("CCC: child received %d\n", cred.pid);
                if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
                        exit(1);
        }
@@ -813,16 +886,19 @@ static void pidreader(int sock, pid_t tpid)
 }
 
 /*
- * pidreader_wrapper: when you setns into a pidns, you yourself remain
+ * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
  * in your old pidns.  Only children which you fork will be in the target
- * pidns.  So the pidreader_wrapper does the setns, then forks a child to
+ * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
  * actually convert pids
  */
-static void pidreader_wrapper(int sock, pid_t tpid)
+static void pid_to_ns_wrapper(int sock, pid_t tpid)
 {
-       int newnsfd = -1;
+       int newnsfd = -1, ret, cpipe[2];
        char fnam[100];
        pid_t cpid;
+       struct timeval tv;
+       fd_set s;
+       char v;
 
        sprintf(fnam, "/proc/%d/ns/pid", tpid);
        newnsfd = open(fnam, O_RDONLY);
@@ -832,15 +908,46 @@ static void pidreader_wrapper(int sock, pid_t tpid)
                exit(1);
        close(newnsfd);
 
-       cpid = fork();
+       if (pipe(cpipe) < 0)
+               exit(1);
 
+loop:
+       cpid = fork();
        if (cpid < 0)
                exit(1);
-       if (!cpid)
-               pidreader(sock, tpid);
+
+       if (!cpid) {
+               char b = '1';
+               close(cpipe[0]);
+               if (write(cpipe[1], &b, sizeof(char)) < 0) {
+                       fprintf(stderr, "%s (child): erorr on write: %s\n",
+                               __func__, strerror(errno));
+               }
+               close(cpipe[1]);
+               pid_to_ns(sock, tpid);
+       }
+       // give the child 1 second to be done forking and
+       // write its ack
+       FD_ZERO(&s);
+       FD_SET(cpipe[0], &s);
+       tv.tv_sec = 1;
+       tv.tv_usec = 0;
+       ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
+       if (ret <= 0)
+               goto again;
+       ret = read(cpipe[0], &v, 1);
+       if (ret != sizeof(char) || v != '1') {
+               goto again;
+       }
+
        if (!wait_for_pid(cpid))
                exit(1);
        exit(0);
+
+again:
+       kill(cpid, SIGKILL);
+       wait_for_pid(cpid);
+       goto loop;
 }
 
 /*
@@ -879,34 +986,38 @@ static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const c
                goto out;
 
        if (!cpid) // child
-               pidreader_wrapper(sock[1], tpid);
+               pid_to_ns_wrapper(sock[1], tpid);
 
        char *ptr = tmpdata;
        cred.uid = 0;
        cred.gid = 0;
        while (sscanf(ptr, "%d\n", &qpid) == 1) {
                cred.pid = qpid;
-               printf("AAA: sending %d\n", qpid);
-               if (!send_creds(sock[0], &cred, v))
+               ret = send_creds(sock[0], &cred, v, true);
+
+               if (ret == SEND_CREDS_NOTSK)
+                       goto next;
+               if (ret == SEND_CREDS_FAIL)
                        goto out;
 
                // read converted results
                FD_ZERO(&s);
                FD_SET(sock[0], &s);
-               tv.tv_sec = 1;
+               tv.tv_sec = 2;
                tv.tv_usec = 0;
                ret = select(sock[0]+1, &s, NULL, NULL, &tv);
                if (ret <= 0) {
-                       kill(cpid, SIGTERM);
+                       fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
+                               __func__, strerror(errno));
                        goto out;
                }
                if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
-                       kill(cpid, SIGTERM);
-                       perror("read");
+                       fprintf(stderr, "%s: error reading pid from child: %s\n",
+                               __func__, strerror(errno));
                        goto out;
                }
-               printf("BBB: read %d\n", qpid);
                NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
+next:
                ptr = strchr(ptr, '\n');
                if (!ptr)
                        break;
@@ -915,9 +1026,10 @@ static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const c
 
        cred.pid = getpid();
        v = '1';
-       if (!send_creds(sock[0], &cred, v)) {
+       if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
                // failed to ask child to exit
-               kill(cpid, SIGTERM);
+               fprintf(stderr, "%s: failed to ask child to exit: %s\n",
+                       __func__, strerror(errno));
                goto out;
        }
 
@@ -936,56 +1048,47 @@ out:
 static int cg_read(const char *path, char *buf, size_t size, off_t offset,
                struct fuse_file_info *fi)
 {
-       nih_local char *controller = NULL;
-       const char *cgroup;
-       char *fpath = NULL, *path1, *path2;
        struct fuse_context *fc = fuse_get_context();
-       nih_local char * cgdir = NULL;
+       struct file_info *f = (struct file_info *)fi->fh;
        nih_local struct cgm_keys *k = NULL;
 
+       if (f->type != LXC_TYPE_CGFILE) {
+               fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
+               return -EIO;
+       }
+
        if (offset)
                return -EIO;
 
        if (!fc)
                return -EIO;
 
-       controller = pick_controller_from_path(fc, path);
-       if (!controller)
-               return -EINVAL;
-       cgroup = find_cgroup_in_path(path);
-       if (!cgroup)
+       if (!f->controller)
                return -EINVAL;
 
-       get_cgdir_and_path(cgroup, &cgdir, &fpath);
-       if (!fpath) {
-               path1 = "/";
-               path2 = cgdir;
-       } else {
-               path1 = cgdir;
-               path2 = fpath;
-       }
-
-       if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
+       if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
                nih_local char *data = NULL;
-               int s, ret;
+               int s;
+               bool r;
 
-               if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
+               if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
                        // should never get here
                        return -EACCES;
 
-               printf("XXX path2 is .%s.\n", path2);
-               if (strcmp(path2, "tasks") == 0 ||
-                               strcmp(path2, "/tasks") == 0 ||
-                               strcmp(path2, "/cgroup.procs") == 0 ||
-                               strcmp(path2, "cgroup.procs") == 0)
+               if (strcmp(f->file, "tasks") == 0 ||
+                               strcmp(f->file, "/tasks") == 0 ||
+                               strcmp(f->file, "/cgroup.procs") == 0 ||
+                               strcmp(f->file, "cgroup.procs") == 0)
                        // special case - we have to translate the pids
-                       ret = do_read_pids(fc->pid, controller, path1, path2, &data);
+                       r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
                else
-                       ret = cgm_get_value(controller, path1, path2, &data);
+                       r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
 
-               if (ret == 0)
+               if (!r)
                        return -EINVAL;
 
+               if (!data)
+                       return 0;
                s = strlen(data);
                if (s > size)
                        s = size;
@@ -997,15 +1100,184 @@ static int cg_read(const char *path, char *buf, size_t size, off_t offset,
        return -EINVAL;
 }
 
+static void pid_from_ns(int sock, pid_t tpid)
+{
+       pid_t vpid;
+       struct ucred cred;
+       char v;
+       struct timeval tv;
+       fd_set s;
+       int ret;
+
+       cred.uid = 0;
+       cred.gid = 0;
+       while (1) {
+               FD_ZERO(&s);
+               FD_SET(sock, &s);
+               tv.tv_sec = 2;
+               tv.tv_usec = 0;
+               ret = select(sock+1, &s, NULL, NULL, &tv);
+               if (ret <= 0) {
+                       fprintf(stderr, "%s: bad select before read from parent: %s\n",
+                               __func__, strerror(errno));
+                       exit(1);
+               }
+               if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
+                       fprintf(stderr, "%s: bad read from parent: %s\n",
+                               __func__, strerror(errno));
+                       exit(1);
+               }
+               if (vpid == -1) // done
+                       break;
+               v = '0';
+               cred.pid = vpid;
+               if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
+                       v = '1';
+                       cred.pid = getpid();
+                       if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
+                               exit(1);
+               }
+       }
+       exit(0);
+}
+
+static void pid_from_ns_wrapper(int sock, pid_t tpid)
+{
+       int newnsfd = -1, ret, cpipe[2];
+       char fnam[100];
+       pid_t cpid;
+       fd_set s;
+       struct timeval tv;
+       char v;
+
+       sprintf(fnam, "/proc/%d/ns/pid", tpid);
+       newnsfd = open(fnam, O_RDONLY);
+       if (newnsfd < 0)
+               exit(1);
+       if (setns(newnsfd, 0) < 0)
+               exit(1);
+       close(newnsfd);
+
+       if (pipe(cpipe) < 0)
+               exit(1);
+
+loop:
+       cpid = fork();
+
+       if (cpid < 0)
+               exit(1);
+
+       if (!cpid) {
+               char b = '1';
+               close(cpipe[0]);
+               if (write(cpipe[1], &b, sizeof(char)) < 0) {
+                       fprintf(stderr, "%s (child): erorr on write: %s\n",
+                               __func__, strerror(errno));
+               }
+               close(cpipe[1]);
+               pid_from_ns(sock, tpid);
+       }
+
+       // give the child 1 second to be done forking and
+       // write its ack
+       FD_ZERO(&s);
+       FD_SET(cpipe[0], &s);
+       tv.tv_sec = 1;
+       tv.tv_usec = 0;
+       ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
+       if (ret <= 0)
+               goto again;
+       ret = read(cpipe[0], &v, 1);
+       if (ret != sizeof(char) || v != '1') {
+               goto again;
+       }
+
+       if (!wait_for_pid(cpid))
+               exit(1);
+       exit(0);
+
+again:
+       kill(cpid, SIGKILL);
+       wait_for_pid(cpid);
+       goto loop;
+}
+
+static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
+{
+       int sock[2] = {-1, -1};
+       pid_t qpid, cpid = -1;
+       bool answer = false, fail = false;
+
+       /*
+        * write the pids to a socket, have helper in writer's pidns
+        * call movepid for us
+        */
+       if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
+               perror("socketpair");
+               exit(1);
+       }
+
+       cpid = fork();
+       if (cpid == -1)
+               goto out;
+
+       if (!cpid) // child
+               pid_from_ns_wrapper(sock[1], tpid);
+
+       const char *ptr = buf;
+       while (sscanf(ptr, "%d", &qpid) == 1) {
+               struct ucred cred;
+               char v;
+
+               if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
+                       fprintf(stderr, "%s: error writing pid to child: %s\n",
+                               __func__, strerror(errno));
+                       goto out;
+               }
+
+               if (recv_creds(sock[0], &cred, &v)) {
+                       if (v == '0') {
+                               if (!cgm_move_pid(contrl, cg, cred.pid))
+                                       fail = true;
+                       }
+               }
+
+               ptr = strchr(ptr, '\n');
+               if (!ptr)
+                       break;
+               ptr++;
+       }
+
+       /* All good, write the value */
+       qpid = -1;
+       if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
+               fprintf(stderr, "Warning: failed to ask child to exit\n");
+
+       if (!fail)
+               answer = true;
+
+out:
+       if (cpid != -1)
+               wait_for_pid(cpid);
+       if (sock[0] != -1) {
+               close(sock[0]);
+               close(sock[1]);
+       }
+       return answer;
+}
+
 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
             struct fuse_file_info *fi)
 {
-       nih_local char *controller = NULL;
-       const char *cgroup;
-       char *fpath = NULL, *path1, *path2;
        struct fuse_context *fc = fuse_get_context();
-       nih_local char * cgdir = NULL;
+       nih_local char *localbuf = NULL;
        nih_local struct cgm_keys *k = NULL;
+       struct file_info *f = (struct file_info *)fi->fh;
+
+       if (f->type != LXC_TYPE_CGFILE) {
+               fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
+               return -EIO;
+       }
 
        if (offset)
                return -EINVAL;
@@ -1013,27 +1285,26 @@ int cg_write(const char *path, const char *buf, size_t size, off_t offset,
        if (!fc)
                return -EIO;
 
-       controller = pick_controller_from_path(fc, path);
-       if (!controller)
-               return -EINVAL;
-       cgroup = find_cgroup_in_path(path);
-       if (!cgroup)
-               return -EINVAL;
+       localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
+       localbuf[size] = '\0';
+       memcpy(localbuf, buf, size);
 
-       get_cgdir_and_path(cgroup, &cgdir, &fpath);
-       if (!fpath) {
-               path1 = "/";
-               path2 = cgdir;
-       } else {
-               path1 = cgdir;
-               path2 = fpath;
-       }
+       if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
+               bool r;
 
-       if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
-               if (!fc_may_access(fc, controller, path1, path2, O_WRONLY))
+               if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY))
                        return -EACCES;
 
-               if (!cgm_set_value(controller, path1, path2, buf))
+               if (strcmp(f->file, "tasks") == 0 ||
+                               strcmp(f->file, "/tasks") == 0 ||
+                               strcmp(f->file, "/cgroup.procs") == 0 ||
+                               strcmp(f->file, "cgroup.procs") == 0)
+                       // special case - we have to translate the pids
+                       r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
+               else
+                       r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
+
+               if (!r)
                        return -EINVAL;
 
                return size;
@@ -1255,6 +1526,29 @@ static void get_mem_cached(char *memstat, unsigned long *v)
        }
 }
 
+static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
+{   
+       char *eol;
+       char key[32];
+       
+       memset(key, 0, 32);
+       snprintf(key, 32, "%u:%u %s", major, minor, iotype);
+       
+       size_t len = strlen(key);
+       *v = 0;
+
+       while (*str) {
+               if (startswith(str, key)) {
+                       sscanf(str + len, "%lu", v);
+                       return;
+               }
+               eol = strchr(str, '\n');
+               if (!eol)
+                       return;
+               str = eol+1;
+       }
+}
+
 static char *get_pid_cgroup(pid_t pid, const char *contrl)
 {
        nih_local char *fnam = NULL;
@@ -1364,6 +1658,8 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset,
                total_len += l;
        }
 
+       fclose(f);
+       free(line);
        return total_len;
 }
 
@@ -1490,6 +1786,8 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
                }
        }
 
+       fclose(f);
+       free(line);
        return total_len;
 }
 
@@ -1501,7 +1799,7 @@ static int proc_stat_read(char *buf, size_t size, off_t offset,
        nih_local char *cpuset = NULL;
        char *line = NULL;
        size_t linelen = 0, total_len = 0;
-       int curcpu = 0;
+       int curcpu = -1; /* cpu numbering starts at 0 */
        FILE *f;
 
        if (offset)
@@ -1521,16 +1819,20 @@ static int proc_stat_read(char *buf, size_t size, off_t offset,
        while (getline(&line, &linelen, f) != -1) {
                size_t l;
                int cpu;
+               char cpu_char[10]; /* That's a lot of cores */
                char *c;
 
-               if (sscanf(line, "cpu%d", &cpu) != 1) {
-                       /* not a ^cpu line, just print it */
+               if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
+                       /* not a ^cpuN line containing a number N, just print it */
                        l = snprintf(buf, size, "%s", line);
                        buf += l;
                        size -= l;
                        total_len += l;
                        continue;
                }
+
+               if (sscanf(cpu_char, "%d", &cpu) != 1)
+                       continue;
                if (!cpu_in_cpuset(cpu, cpuset))
                        continue;
                curcpu ++;
@@ -1544,6 +1846,8 @@ static int proc_stat_read(char *buf, size_t size, off_t offset,
                total_len += l;
        }
 
+       fclose(f);
+       free(line);
        return total_len;
 }
 
@@ -1568,14 +1872,21 @@ static int proc_stat_read(char *buf, size_t size, off_t offset,
 static long int get_pid1_time(pid_t pid)
 {
        char fnam[100];
-       int fd;
+       int fd, cpipe[2], ret;
        struct stat sb;
-       int ret;
-       pid_t npid;
+       pid_t cpid;
+       struct timeval tv;
+       fd_set s;
+       char v;
 
        if (unshare(CLONE_NEWNS))
                return 0;
 
+       if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
+               perror("rslave mount failed");
+               return 0;
+       }
+
        sprintf(fnam, "/proc/%d/ns/pid", pid);
        fd = open(fnam, O_RDONLY);
        if (fd < 0) {
@@ -1588,28 +1899,57 @@ static long int get_pid1_time(pid_t pid)
                return 0;
        }
        close(fd);
-       npid = fork();
-       if (npid < 0)
-               return 0;
-
-       if (npid) {
-               // child will do the writing for us
-               wait_for_pid(npid);
-               exit(0);
-       }
 
-       umount2("/proc", MNT_DETACH);
+       if (pipe(cpipe) < 0)
+               exit(1);
 
-       if (mount("proc", "/proc", "proc", 0, NULL)) {
-               perror("get_pid1_time mount");
+loop:
+       cpid = fork();
+       if (cpid < 0)
                return 0;
+
+       if (!cpid) {
+               char b = '1';
+               close(cpipe[0]);
+               if (write(cpipe[1], &b, sizeof(char)) < 0) {
+                       fprintf(stderr, "%s (child): erorr on write: %s\n",
+                               __func__, strerror(errno));
+               }
+               close(cpipe[1]);
+               umount2("/proc", MNT_DETACH);
+               if (mount("proc", "/proc", "proc", 0, NULL)) {
+                       perror("get_pid1_time mount");
+                       return 0;
+               }
+               ret = lstat("/proc/1", &sb);
+               if (ret) {
+                       perror("get_pid1_time lstat");
+                       return 0;
+               }
+               return time(NULL) - sb.st_ctime;
        }
-       ret = lstat("/proc/1", &sb);
-       if (ret) {
-               perror("get_pid1_time lstat");
-               return 0;
+
+       // give the child 1 second to be done forking and
+       // write its ack
+       FD_ZERO(&s);
+       FD_SET(cpipe[0], &s);
+       tv.tv_sec = 1;
+       tv.tv_usec = 0;
+       ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
+       if (ret <= 0)
+               goto again;
+       ret = read(cpipe[0], &v, 1);
+       if (ret != sizeof(char) || v != '1') {
+               goto again;
        }
-       return time(NULL) - sb.st_ctime;
+
+       wait_for_pid(cpid);
+       exit(0);
+
+again:
+       kill(cpid, SIGKILL);
+       wait_for_pid(cpid);
+       goto loop;
 }
 
 static long int getreaperage(pid_t qpid)
@@ -1638,12 +1978,12 @@ static long int getreaperage(pid_t qpid)
        tv.tv_sec = 1;
        tv.tv_usec = 0;
        ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
-       if (ret == -1) {
+       if (ret <= 0) {
                perror("select");
                goto out;
        }
        if (!ret) {
-               printf("timed out\n");
+               fprintf(stderr, "timed out\n");
                goto out;
        }
        if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
@@ -1662,9 +2002,12 @@ static long int getprocidle(void)
 {
        FILE *f = fopen("/proc/uptime", "r");
        long int age, idle;
+       int ret;
        if (!f)
                return 0;
-       if (fscanf(f, "%ld %ld", &age, &idle) != 2)
+       ret = fscanf(f, "%ld %ld", &age, &idle);
+       fclose(f);
+       if (ret != 2)
                return 0;
        return idle;
 }
@@ -1686,6 +2029,101 @@ static int proc_uptime_read(char *buf, size_t size, off_t offset,
        return snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
 }
 
+/*
+ * Produce /proc/diskstats for the calling task from its blkio cgroup.  The
+ * host file only supplies the device list (major, minor, name); counters come
+ * from the cgroup's blkio.* values, idle devices are omitted, and ios_pgr and
+ * rq_ticks have no source here so they are always 0.
+ *
+ * Only offset 0 is supported (-EINVAL).  Returns the bytes stored in buf, 0
+ * if cgroup data is unavailable, -EIO without a fuse context.  Output ends
+ * at the last whole line that fits in size.
+ */
+static int proc_diskstats_read(char *buf, size_t size, off_t offset,
+               struct fuse_file_info *fi)
+{
+       struct fuse_context *fc = fuse_get_context();
+       nih_local char *cg = NULL;
+       nih_local char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
+                       *io_wait_time_str = NULL, *io_service_time_str = NULL;
+       char *line = NULL;
+       size_t linelen = 0, total_len = 0;
+       FILE *f;
+
+       if (offset)
+               return -EINVAL;
+       if (!fc)
+               return -EIO;
+
+       cg = get_pid_cgroup(fc->pid, "blkio");
+       if (!cg)
+               return 0;
+
+       if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str) ||
+           !cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str) ||
+           !cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str) ||
+           !cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str) ||
+           !cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
+               return 0;
+
+       f = fopen("/proc/diskstats", "r");
+       if (!f)
+               return 0;
+
+       while (getline(&line, &linelen, f) != -1) {
+               /* fresh per device: a lookup miss must not reuse the last device's values */
+               unsigned long rd_ios = 0, wr_ios = 0, rd_merged = 0, wr_merged = 0;
+               unsigned long rd_sectors = 0, wr_sectors = 0, rd_ticks, wr_ticks;
+               unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
+               unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
+               unsigned int major = 0, minor = 0;
+               char dev_name[72];
+               int l;
+
+               /* %71s bounds the name to dev_name; plain %s could overrun it */
+               if (sscanf(line, "%u %u %71s", &major, &minor, dev_name) != 3)
+                       continue;
+
+               get_blkio_io_value(io_serviced_str, major, minor, "Read", &rd_ios);
+               get_blkio_io_value(io_serviced_str, major, minor, "Write", &wr_ios);
+               get_blkio_io_value(io_merged_str, major, minor, "Read", &rd_merged);
+               get_blkio_io_value(io_merged_str, major, minor, "Write", &wr_merged);
+
+               /* blkio reports bytes; diskstats wants 512-byte sectors */
+               get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &rd_sectors);
+               get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &wr_sectors);
+               rd_sectors /= 512;
+               wr_sectors /= 512;
+
+               /* blkio reports nanoseconds; diskstats wants milliseconds */
+               get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
+               get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
+               get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
+               get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
+               get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
+               rd_ticks = rd_svctm / 1000000 + rd_wait / 1000000;
+               wr_ticks = wr_svctm / 1000000 + wr_wait / 1000000;
+               tot_ticks /= 1000000;
+
+               if (!(rd_ios || wr_ios || rd_merged || wr_merged ||
+                     rd_sectors || wr_sectors || rd_ticks || wr_ticks))
+                       continue;
+
+               l = snprintf(buf, size, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+                       major, minor, dev_name, rd_ios, rd_merged, rd_sectors, rd_ticks,
+                       wr_ios, wr_merged, wr_sectors, wr_ticks, ios_pgr, tot_ticks, rq_ticks);
+               if (l < 0 || (size_t)l >= size)
+                       break; /* out of room: keep only the whole lines already stored */
+               buf += l;
+               size -= l;
+               total_len += l;
+       }
+
+       fclose(f);
+       free(line);
+       return total_len;
+}
+
 static off_t get_procfile_size(const char *which)
 {
        FILE *f = fopen(which, "r");
@@ -1698,6 +2136,7 @@ static off_t get_procfile_size(const char *which)
        while ((sz = getline(&line, &len, f)) != -1)
                answer += sz;
        fclose (f);
+       free(line);
 
        return answer;
 }
@@ -1719,8 +2158,8 @@ static int proc_getattr(const char *path, struct stat *sb)
        if (strcmp(path, "/proc/meminfo") == 0 ||
                        strcmp(path, "/proc/cpuinfo") == 0 ||
                        strcmp(path, "/proc/uptime") == 0 ||
-                       strcmp(path, "/proc/stat") == 0) {
-
+                       strcmp(path, "/proc/stat") == 0 ||
+                       strcmp(path, "/proc/diskstats") == 0) {
                sb->st_size = get_procfile_size(path);
                sb->st_mode = S_IFREG | 00444;
                sb->st_nlink = 1;
@@ -1736,7 +2175,8 @@ static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off
        if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
                                filler(buf, "meminfo", NULL, 0) != 0 ||
                                filler(buf, "stat", NULL, 0) != 0 ||
-                               filler(buf, "uptime", NULL, 0) != 0)
+                               filler(buf, "uptime", NULL, 0) != 0 ||
+                               filler(buf, "diskstats", NULL, 0) != 0)
                return -EINVAL;
        return 0;
 }
@@ -1746,7 +2186,8 @@ static int proc_open(const char *path, struct fuse_file_info *fi)
        if (strcmp(path, "/proc/meminfo") == 0 ||
                        strcmp(path, "/proc/cpuinfo") == 0 ||
                        strcmp(path, "/proc/uptime") == 0 ||
-                       strcmp(path, "/proc/stat") == 0)
+                       strcmp(path, "/proc/stat") == 0 || 
+                       strcmp(path, "/proc/diskstats") == 0)
                return 0;
        return -ENOENT;
 }
@@ -1762,6 +2203,8 @@ static int proc_read(const char *path, char *buf, size_t size, off_t offset,
                return proc_uptime_read(buf, size, offset, fi);
        if (strcmp(path, "/proc/stat") == 0)
                return proc_stat_read(buf, size, offset, fi);
+       if (strcmp(path, "/proc/diskstats") == 0)
+               return proc_diskstats_read(buf, size, offset, fi);
        return -EINVAL;
 }
 
@@ -1866,7 +2309,14 @@ static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
 
 static int lxcfs_release(const char *path, struct fuse_file_info *fi)
 {
-       return 0;
+       if (strncmp(path, "/cgroup", 7) == 0)
+               return cg_release(path, fi);
+#if 0
+       if (strncmp(path, "/proc", 5) == 0)
+               return proc_close(path, fi);
+#endif
+
+       return -EINVAL;
 }
 
 static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)