From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 25 Feb 2020 16:17:10 +0000 (+0100)
Subject: bindings: split cgroup part of lxcfs into separate files
X-Git-Tag: lxcfs-4.0.0~33^2~2
X-Git-Url: https://git.proxmox.com/?p=mirror_lxcfs.git;a=commitdiff_plain;h=580fe4df03735cdbb2f7c3d474b71b951aaddca3

bindings: split cgroup part of lxcfs into separate files

This was long overdue since the cgroup faking part is basically unused
at this point on most kernels.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---

diff --git a/Makefile.am b/Makefile.am
index d37aa7f..e3c4c24 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -13,6 +13,7 @@ AM_LDFLAGS = $(FUSE_LIBS) -pthread
 AM_CFLAGS += -DRUNTIME_PATH=\"$(RUNTIME_PATH)\"
 
 liblxcfs_la_SOURCES = bindings.c bindings.h \
+		      cgroup_fuse.c cgroup_fuse.h \
 		      cgroups/cgfsng.c \
 		      cgroups/cgroup.c cgroups/cgroup.h \
 		      cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \
@@ -25,6 +26,7 @@ liblxcfs_la_CFLAGS = $(AM_CFLAGS)
 liblxcfs_la_LDFLAGS = $(AM_CFLAGS) -module -avoid-version -shared
 
 liblxcfstest_la_SOURCES = bindings.c bindings.h \
+			  cgroup_fuse.c cgroup_fuse.h \
 			  cgroups/cgfsng.c \
 			  cgroups/cgroup.c cgroups/cgroup.h \
 			  cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \
diff --git a/bindings.c b/bindings.c
index ddaa528..83243b8 100644
--- a/bindings.c
+++ b/bindings.c
@@ -39,6 +39,7 @@
 
 #include "bindings.h"
 #include "config.h"
+#include "cgroup_fuse.h"
 #include "cgroups/cgroup.h"
 #include "cgroups/cgroup_utils.h"
 #include "memory_utils.h"
@@ -574,41 +575,6 @@ static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 	return NULL;
 }
 
-static int is_dir(const char *path, int fd)
-{
-	struct stat statbuf;
-	int ret = fstatat(fd, path, &statbuf, fd);
-	if (ret == 0 && S_ISDIR(statbuf.st_mode))
-		return 1;
-	return 0;
-}
-
-static bool write_string(const char *fnam, const char *string, int fd)
-{
-	FILE *f;
-	size_t len, ret;
-
-	f = fdopen(fd, "w");
-	if (!f)
-		return false;
-
-	len = strlen(string);
-	ret = fwrite(string, 1, len, f);
-	if (ret != len) {
-		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
-			    strerror(errno), string, fnam);
-		fclose(f);
-		return false;
-	}
-
-	if (fclose(f) < 0) {
-		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
-		return false;
-	}
-
-	return true;
-}
-
 struct cgfs_files {
 	char *name;
 	uint32_t uid, gid;
@@ -627,10 +593,9 @@ static void print_subsystems(void)
 	}
 }
 
-bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
-		const char *value)
+bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
 {
-	int ret, fd, cfd;
+	int ret, cfd;
 	size_t len;
 	char *fnam;
 
@@ -647,2401 +612,353 @@ bool cgfs_set_value(const char *controller, const char *cgroup, const char *file
 	if (ret < 0 || (size_t)ret >= len)
 		return false;
 
-	fd = openat(cfd, fnam, O_WRONLY);
-	if (fd < 0)
-		return false;
-
-	return write_string(fnam, value, fd);
+	return (faccessat(cfd, fnam, F_OK, 0) == 0);
 }
 
-// Chown all the files in the cgroup directory.  We do this when we create
-// a cgroup on behalf of a user.
-static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
-{
-	struct dirent *direntp;
-	char path[MAXPATHLEN];
-	size_t len;
-	DIR *d;
-	int fd1, ret;
+#define SEND_CREDS_OK 0
+#define SEND_CREDS_NOTSK 1
+#define SEND_CREDS_FAIL 2
+static bool recv_creds(int sock, struct ucred *cred, char *v);
+static int wait_for_pid(pid_t pid);
+static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
+static int send_creds_clone_wrapper(void *arg);
 
-	len = strlen(dirname);
-	if (len >= MAXPATHLEN) {
-		lxcfs_error("Pathname too long: %s\n", dirname);
-		return;
-	}
+/*
+ * Clone a task which switches to @target's pid namespace and writes '1'
+ * over a unix socket, so that we can read the pid of @target's reaper
+ * (its pidns init) as seen from our own pid namespace.
+ *
+ * Note: glibc's fork() does not respect pidns, which can lead to failed
+ * assertions inside glibc (and thus failed forks) if the child's pid in
+ * the pidns and the parent pid outside are identical. Using clone prevents
+ * this issue.
+ */
+static void write_task_init_pid_exit(int sock, pid_t target)
+{
+	char fnam[100];
+	pid_t pid;
+	int fd, ret;
+	size_t stack_size = sysconf(_SC_PAGESIZE);
+	void *stack = alloca(stack_size);
 
-	fd1 = openat(fd, dirname, O_DIRECTORY);
-	if (fd1 < 0)
-		return;
+	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
+	if (ret < 0 || ret >= sizeof(fnam))
+		_exit(1);
 
-	d = fdopendir(fd1);
-	if (!d) {
-		lxcfs_error("Failed to open %s\n", dirname);
-		return;
+	fd = open(fnam, O_RDONLY);
+	if (fd < 0) {
+		perror("write_task_init_pid_exit open of ns/pid");
+		_exit(1);
 	}
-
-	while ((direntp = readdir(d))) {
-		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
-			continue;
-		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
-		if (ret < 0 || ret >= MAXPATHLEN) {
-			lxcfs_error("Pathname too long under %s\n", dirname);
-			continue;
-		}
-		if (fchownat(fd, path, uid, gid, 0) < 0)
-			lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
+	if (setns(fd, 0)) {
+		perror("write_task_init_pid_exit setns 1");
+		close(fd);
+		_exit(1);
+	}
+	pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
+	if (pid < 0)
+		_exit(1);
+	if (pid != 0) {
+		if (!wait_for_pid(pid))
+			_exit(1);
+		_exit(0);
 	}
-	closedir(d);
 }
 
-int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
-{
-	int cfd;
-	size_t len;
-	char *dirnam;
-
-	cfd = get_cgroup_fd(controller);
-	if (cfd < 0)
-		return -EINVAL;
-
-	/* Make sure we pass a relative path to *at() family of functions.
-	 * . + /cg + \0
-	 */
-	len = strlen(cg) + 2;
-	dirnam = alloca(len);
-	snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
-
-	if (mkdirat(cfd, dirnam, 0755) < 0)
-		return -errno;
-
-	if (uid == 0 && gid == 0)
-		return 0;
-
-	if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
-		return -errno;
-
-	chown_all_cgroup_files(dirnam, uid, gid, cfd);
+static int send_creds_clone_wrapper(void *arg) {
+	struct ucred cred;
+	char v;
+	int sock = *(int *)arg;
 
+	/* we are the child */
+	cred.uid = 0;
+	cred.gid = 0;
+	cred.pid = 1;
+	v = '1';
+	if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
+		return 1;
 	return 0;
 }
 
-static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
+static pid_t get_init_pid_for_task(pid_t task)
 {
-	struct dirent *direntp;
-	DIR *dir;
-	bool ret = false;
-	char pathname[MAXPATHLEN];
-	int dupfd;
-
-	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
-	if (dupfd < 0)
-		return false;
-
-	dir = fdopendir(dupfd);
-	if (!dir) {
-		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
-		close(dupfd);
-		return false;
-	}
-
-	while ((direntp = readdir(dir))) {
-		struct stat mystat;
-		int rc;
-
-		if (!strcmp(direntp->d_name, ".") ||
-		    !strcmp(direntp->d_name, ".."))
-			continue;
-
-		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
-		if (rc < 0 || rc >= MAXPATHLEN) {
-			lxcfs_error("%s\n", "Pathname too long.");
-			continue;
-		}
-
-		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
-		if (rc) {
-			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
-			continue;
-		}
-		if (S_ISDIR(mystat.st_mode))
-			if (!recursive_rmdir(pathname, fd, cfd))
-				lxcfs_debug("Error removing %s.\n", pathname);
-	}
+	int sock[2];
+	pid_t pid;
+	pid_t ret = -1;
+	char v = '0';
+	struct ucred cred;
 
-	ret = true;
-	if (closedir(dir) < 0) {
-		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
-		ret = false;
+	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
+		perror("socketpair");
+		return -1;
 	}
 
-	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
-		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
-		ret = false;
+	pid = fork();
+	if (pid < 0)
+		goto out;
+	if (!pid) {
+		close(sock[1]);
+		write_task_init_pid_exit(sock[0], task);
+		_exit(0);
 	}
 
-	close(dupfd);
+	if (!recv_creds(sock[1], &cred, &v))
+		goto out;
+	ret = cred.pid;
 
+out:
+	close(sock[0]);
+	close(sock[1]);
+	if (pid > 0)
+		wait_for_pid(pid);
 	return ret;
 }
 
-bool cgfs_remove(const char *controller, const char *cg)
+pid_t lookup_initpid_in_store(pid_t qpid)
 {
-	int fd, cfd;
-	size_t len;
-	char *dirnam;
-	bool bret;
-
-	cfd = get_cgroup_fd(controller);
-	if (cfd < 0)
-		return false;
-
-	/* Make sure we pass a relative path to *at() family of functions.
-	 * . +  /cg + \0
-	 */
-	len = strlen(cg) + 2;
-	dirnam = alloca(len);
-	snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
+	pid_t answer = 0;
+	struct stat sb;
+	struct pidns_init_store *e;
+	char fnam[100];
 
-	fd = openat(cfd, dirnam, O_DIRECTORY);
-	if (fd < 0)
-		return false;
+	snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
+	store_lock();
+	if (stat(fnam, &sb) < 0)
+		goto out;
+	e = lookup_verify_initpid(&sb);
+	if (e) {
+		answer = e->initpid;
+		goto out;
+	}
+	answer = get_init_pid_for_task(qpid);
+	if (answer > 0)
+		save_initpid(&sb, answer);
 
-	bret = recursive_rmdir(dirnam, fd, cfd);
-	close(fd);
-	return bret;
+out:
+	/* Prune only at the end, in case pruning would drop
+	 * the entry whose value we are about to return. */
+	prune_initpid_store();
+	store_unlock();
+	return answer;
 }
 
-bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
+static int wait_for_pid(pid_t pid)
 {
-	int cfd;
-	size_t len;
-	char *pathname;
-
-	cfd = get_cgroup_fd(controller);
-	if (cfd < 0)
-		return false;
+	int status, ret;
 
-	/* Make sure we pass a relative path to *at() family of functions.
-	 * . + /file + \0
-	 */
-	len = strlen(file) + 2;
-	pathname = alloca(len);
-	snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
-	if (fchmodat(cfd, pathname, mode, 0) < 0)
-		return false;
-	return true;
-}
+	if (pid <= 0)
+		return -1;
 
-static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
-{
-	size_t len;
-	char *fname;
-
-	len = strlen(dirname) + strlen("/cgroup.procs") + 1;
-	fname = alloca(len);
-	snprintf(fname, len, "%s/tasks", dirname);
-	if (fchownat(fd, fname, uid, gid, 0) != 0)
-		return -errno;
-	snprintf(fname, len, "%s/cgroup.procs", dirname);
-	if (fchownat(fd, fname, uid, gid, 0) != 0)
-		return -errno;
+again:
+	ret = waitpid(pid, &status, 0);
+	if (ret == -1) {
+		if (errno == EINTR)
+			goto again;
+		return -1;
+	}
+	if (ret != pid)
+		goto again;
+	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
+		return -1;
 	return 0;
 }
 
-int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
+char *get_pid_cgroup(pid_t pid, const char *contrl)
 {
 	int cfd;
-	size_t len;
-	char *pathname;
 
-	cfd = get_cgroup_fd(controller);
+	cfd = get_cgroup_fd(contrl);
 	if (cfd < 0)
 		return false;
 
-	/* Make sure we pass a relative path to *at() family of functions.
-	 * . + /file + \0
-	 */
-	len = strlen(file) + 2;
-	pathname = alloca(len);
-	snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
-	if (fchownat(cfd, pathname, uid, gid, 0) < 0)
-		return -errno;
-
-	if (is_dir(pathname, cfd))
-		// like cgmanager did, we want to chown the tasks file as well
-		return chown_tasks_files(pathname, uid, gid, cfd);
+	if (pure_unified_layout(cgroup_ops))
+		return cg_unified_get_current_cgroup(pid);
 
-	return 0;
+	return cg_legacy_get_current_cgroup(pid, contrl);
 }
 
-FILE *open_pids_file(const char *controller, const char *cgroup)
+#define INITSCOPE "/init.scope"
+void prune_init_slice(char *cg)
 {
-	int fd, cfd;
-	size_t len;
-	char *pathname;
-
-	cfd = get_cgroup_fd(controller);
-	if (cfd < 0)
-		return false;
-
-	/* Make sure we pass a relative path to *at() family of functions.
-	 * . + /cgroup + / "cgroup.procs" + \0
-	 */
-	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
-	pathname = alloca(len);
-	snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);
+	char *point;
+	size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
 
-	fd = openat(cfd, pathname, O_WRONLY);
-	if (fd < 0)
-		return NULL;
+	if (cg_len < initscope_len)
+		return;
 
-	return fdopen(fd, "w");
+	point = cg + cg_len - initscope_len;
+	if (strcmp(point, INITSCOPE) == 0) {
+		if (point == cg)
+			*(point+1) = '\0';
+		else
+			*point = '\0';
+	}
 }
 
-static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
-                                void ***list, size_t typesize,
-                                void* (*iterator)(const char*, const char*, const char*))
+#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
+
+static bool wait_for_sock(int sock, int timeout)
 {
-	int cfd, fd, ret;
-	size_t len;
-	char *cg;
-	char pathname[MAXPATHLEN];
-	size_t sz = 0, asz = 0;
-	struct dirent *dirent;
-	DIR *dir;
+	struct epoll_event ev;
+	int epfd, ret, now, starttime, deltatime, saved_errno;
 
-	cfd = get_cgroup_fd(controller);
-	*list = NULL;
-	if (cfd < 0)
+	if ((starttime = time(NULL)) < 0)
 		return false;
 
-	/* Make sure we pass a relative path to *at() family of functions. */
-	len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
-	cg = alloca(len);
-	ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
-	if (ret < 0 || (size_t)ret >= len) {
-		lxcfs_error("Pathname too long under %s\n", cgroup);
+	if ((epfd = epoll_create(1)) < 0) {
+		lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
 		return false;
 	}
 
-	fd = openat(cfd, cg, O_DIRECTORY);
-	if (fd < 0)
-		return false;
-
-	dir = fdopendir(fd);
-	if (!dir)
-		return false;
-
-	while ((dirent = readdir(dir))) {
-		struct stat mystat;
-
-		if (!strcmp(dirent->d_name, ".") ||
-		    !strcmp(dirent->d_name, ".."))
-			continue;
-
-		ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
-		if (ret < 0 || ret >= MAXPATHLEN) {
-			lxcfs_error("Pathname too long under %s\n", cg);
-			continue;
-		}
-
-		ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
-		if (ret) {
-			lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
-			continue;
-		}
-		if ((!directories && !S_ISREG(mystat.st_mode)) ||
-		    (directories && !S_ISDIR(mystat.st_mode)))
-			continue;
-
-		if (sz+2 >= asz) {
-			void **tmp;
-			asz += BATCH_SIZE;
-			do {
-				tmp = realloc(*list, asz * typesize);
-			} while  (!tmp);
-			*list = tmp;
-		}
-		(*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
-		(*list)[sz+1] = NULL;
-		sz++;
-	}
-	if (closedir(dir) < 0) {
-		lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
-		return false;
-	}
-	return true;
-}
-
-static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
-{
-	char *dup;
-	do {
-		dup = strdup(dir_entry);
-	} while (!dup);
-	return dup;
-}
-
-bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
-{
-	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
-}
-
-void free_key(struct cgfs_files *k)
-{
-	if (!k)
-		return;
-	free_disarm(k->name);
-	free_disarm(k);
-}
-
-void free_keys(struct cgfs_files **keys)
-{
-	int i;
-
-	if (!keys)
-		return;
-	for (i = 0; keys[i]; i++) {
-		free_key(keys[i]);
-	}
-	free_disarm(keys);
-}
-
-bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
-{
-	int ret, cfd;
-	size_t len;
-	char *fnam;
-
-	cfd = get_cgroup_fd(controller);
-	if (cfd < 0)
-		return false;
-
-	/* Make sure we pass a relative path to *at() family of functions.
-	 * . + /cgroup + / + file + \0
-	 */
-	len = strlen(cgroup) + strlen(file) + 3;
-	fnam = alloca(len);
-	ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
-	if (ret < 0 || (size_t)ret >= len)
-		return false;
-
-	return (faccessat(cfd, fnam, F_OK, 0) == 0);
-}
-
-struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
-{
-	int ret, cfd;
-	size_t len;
-	char *fnam;
-	struct stat sb;
-	struct cgfs_files *newkey;
-
-	cfd = get_cgroup_fd(controller);
-	if (cfd < 0)
-		return false;
-
-	if (file && *file == '/')
-		file++;
-
-	if (file && strchr(file, '/'))
-		return NULL;
-
-	/* Make sure we pass a relative path to *at() family of functions.
-	 * . + /cgroup + / + file + \0
-	 */
-	len = strlen(cgroup) + 3;
-	if (file)
-		len += strlen(file) + 1;
-	fnam = alloca(len);
-	snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
-		 file ? "/" : "", file ? file : "");
-
-	ret = fstatat(cfd, fnam, &sb, 0);
-	if (ret < 0)
-		return NULL;
-
-	do {
-		newkey = malloc(sizeof(struct cgfs_files));
-	} while (!newkey);
-	if (file)
-		newkey->name = must_copy_string(file);
-	else if (strrchr(cgroup, '/'))
-		newkey->name = must_copy_string(strrchr(cgroup, '/'));
-	else
-		newkey->name = must_copy_string(cgroup);
-	newkey->uid = sb.st_uid;
-	newkey->gid = sb.st_gid;
-	newkey->mode = sb.st_mode;
-
-	return newkey;
-}
-
-static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
-{
-	struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
-	if (!entry) {
-		lxcfs_error("Error getting files under %s:%s\n", controller,
-			     cgroup);
-	}
-	return entry;
-}
-
-bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
-{
-	return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
-}
-
-bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
-{
-	int cfd;
-	size_t len;
-	char *fnam;
-	int ret;
-	struct stat sb;
-
-	cfd = get_cgroup_fd(controller);
-	if (cfd < 0)
-		return false;
-
-	/* Make sure we pass a relative path to *at() family of functions.
-	 * . + /cgroup + / + f + \0
-	 */
-	len = strlen(cgroup) + strlen(f) + 3;
-	fnam = alloca(len);
-	ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
-	if (ret < 0 || (size_t)ret >= len)
-		return false;
-
-	ret = fstatat(cfd, fnam, &sb, 0);
-	if (ret < 0 || !S_ISDIR(sb.st_mode))
-		return false;
-
-	return true;
-}
-
-#define SEND_CREDS_OK 0
-#define SEND_CREDS_NOTSK 1
-#define SEND_CREDS_FAIL 2
-static bool recv_creds(int sock, struct ucred *cred, char *v);
-static int wait_for_pid(pid_t pid);
-static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
-static int send_creds_clone_wrapper(void *arg);
-
-/*
- * clone a task which switches to @task's namespace and writes '1'.
- * over a unix sock so we can read the task's reaper's pid in our
- * namespace
- *
- * Note: glibc's fork() does not respect pidns, which can lead to failed
- * assertions inside glibc (and thus failed forks) if the child's pid in
- * the pidns and the parent pid outside are identical. Using clone prevents
- * this issue.
- */
-static void write_task_init_pid_exit(int sock, pid_t target)
-{
-	char fnam[100];
-	pid_t pid;
-	int fd, ret;
-	size_t stack_size = sysconf(_SC_PAGESIZE);
-	void *stack = alloca(stack_size);
-
-	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
-	if (ret < 0 || ret >= sizeof(fnam))
-		_exit(1);
-
-	fd = open(fnam, O_RDONLY);
-	if (fd < 0) {
-		perror("write_task_init_pid_exit open of ns/pid");
-		_exit(1);
-	}
-	if (setns(fd, 0)) {
-		perror("write_task_init_pid_exit setns 1");
-		close(fd);
-		_exit(1);
-	}
-	pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
-	if (pid < 0)
-		_exit(1);
-	if (pid != 0) {
-		if (!wait_for_pid(pid))
-			_exit(1);
-		_exit(0);
-	}
-}
-
-static int send_creds_clone_wrapper(void *arg) {
-	struct ucred cred;
-	char v;
-	int sock = *(int *)arg;
-
-	/* we are the child */
-	cred.uid = 0;
-	cred.gid = 0;
-	cred.pid = 1;
-	v = '1';
-	if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
-		return 1;
-	return 0;
-}
-
-static pid_t get_init_pid_for_task(pid_t task)
-{
-	int sock[2];
-	pid_t pid;
-	pid_t ret = -1;
-	char v = '0';
-	struct ucred cred;
-
-	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
-		perror("socketpair");
-		return -1;
-	}
-
-	pid = fork();
-	if (pid < 0)
-		goto out;
-	if (!pid) {
-		close(sock[1]);
-		write_task_init_pid_exit(sock[0], task);
-		_exit(0);
-	}
-
-	if (!recv_creds(sock[1], &cred, &v))
-		goto out;
-	ret = cred.pid;
-
-out:
-	close(sock[0]);
-	close(sock[1]);
-	if (pid > 0)
-		wait_for_pid(pid);
-	return ret;
-}
-
-pid_t lookup_initpid_in_store(pid_t qpid)
-{
-	pid_t answer = 0;
-	struct stat sb;
-	struct pidns_init_store *e;
-	char fnam[100];
-
-	snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
-	store_lock();
-	if (stat(fnam, &sb) < 0)
-		goto out;
-	e = lookup_verify_initpid(&sb);
-	if (e) {
-		answer = e->initpid;
-		goto out;
-	}
-	answer = get_init_pid_for_task(qpid);
-	if (answer > 0)
-		save_initpid(&sb, answer);
-
-out:
-	/* we prune at end in case we are returning
-	 * the value we were about to return */
-	prune_initpid_store();
-	store_unlock();
-	return answer;
-}
-
-static int wait_for_pid(pid_t pid)
-{
-	int status, ret;
-
-	if (pid <= 0)
-		return -1;
-
-again:
-	ret = waitpid(pid, &status, 0);
-	if (ret == -1) {
-		if (errno == EINTR)
-			goto again;
-		return -1;
-	}
-	if (ret != pid)
-		goto again;
-	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
-		return -1;
-	return 0;
-}
-
-
-/*
- * append pid to *src.
- * src: a pointer to a char* in which ot append the pid.
- * sz: the number of characters printed so far, minus trailing \0.
- * asz: the allocated size so far
- * pid: the pid to append
- */
-static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
-{
-	must_strcat(src, sz, asz, "%d\n", (int)pid);
-}
-
-/*
- * Given a open file * to /proc/pid/{u,g}id_map, and an id
- * valid in the caller's namespace, return the id mapped into
- * pid's namespace.
- * Returns the mapped id, or -1 on error.
- */
-unsigned int
-convert_id_to_ns(FILE *idfile, unsigned int in_id)
-{
-	unsigned int nsuid,   // base id for a range in the idfile's namespace
-		     hostuid, // base id for a range in the caller's namespace
-		     count;   // number of ids in this range
-	char line[400];
-	int ret;
-
-	fseek(idfile, 0L, SEEK_SET);
-	while (fgets(line, 400, idfile)) {
-		ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
-		if (ret != 3)
-			continue;
-		if (hostuid + count < hostuid || nsuid + count < nsuid) {
-			/*
-			 * uids wrapped around - unexpected as this is a procfile,
-			 * so just bail.
-			 */
-			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
-				nsuid, hostuid, count, line);
-			return -1;
-		}
-		if (hostuid <= in_id && hostuid+count > in_id) {
-			/*
-			 * now since hostuid <= in_id < hostuid+count, and
-			 * hostuid+count and nsuid+count do not wrap around,
-			 * we know that nsuid+(in_id-hostuid) which must be
-			 * less that nsuid+(count) must not wrap around
-			 */
-			return (in_id - hostuid) + nsuid;
-		}
-	}
-
-	// no answer found
-	return -1;
-}
-
-/*
- * for is_privileged_over,
- * specify whether we require the calling uid to be root in his
- * namespace
- */
-#define NS_ROOT_REQD true
-#define NS_ROOT_OPT false
-
-#define PROCLEN 100
-
-static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
-{
-	char fpath[PROCLEN];
-	int ret;
-	bool answer = false;
-	uid_t nsuid;
-
-	if (victim == -1 || uid == -1)
-		return false;
-
-	/*
-	 * If the request is one not requiring root in the namespace,
-	 * then having the same uid suffices.  (i.e. uid 1000 has write
-	 * access to files owned by uid 1000
-	 */
-	if (!req_ns_root && uid == victim)
-		return true;
-
-	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
-	if (ret < 0 || ret >= PROCLEN)
-		return false;
-	FILE *f = fopen(fpath, "r");
-	if (!f)
-		return false;
-
-	/* if caller's not root in his namespace, reject */
-	nsuid = convert_id_to_ns(f, uid);
-	if (nsuid)
-		goto out;
-
-	/*
-	 * If victim is not mapped into caller's ns, reject.
-	 * XXX I'm not sure this check is needed given that fuse
-	 * will be sending requests where the vfs has converted
-	 */
-	nsuid = convert_id_to_ns(f, victim);
-	if (nsuid == -1)
-		goto out;
-
-	answer = true;
-
-out:
-	fclose(f);
-	return answer;
-}
-
-static bool perms_include(int fmode, mode_t req_mode)
-{
-	mode_t r;
-
-	switch (req_mode & O_ACCMODE) {
-	case O_RDONLY:
-		r = S_IROTH;
-		break;
-	case O_WRONLY:
-		r = S_IWOTH;
-		break;
-	case O_RDWR:
-		r = S_IROTH | S_IWOTH;
-		break;
-	default:
-		return false;
-	}
-	return ((fmode & r) == r);
-}
-
-
-/*
- * taskcg is  a/b/c
- * querycg is /a/b/c/d/e
- * we return 'd'
- */
-static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
-{
-	char *start, *end;
-
-	if (strlen(taskcg) <= strlen(querycg)) {
-		lxcfs_error("%s\n", "I was fed bad input.");
-		return NULL;
-	}
-
-	if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
-		start =  strdup(taskcg + 1);
-	else
-		start = strdup(taskcg + strlen(querycg) + 1);
-	if (!start)
-		return NULL;
-	end = strchr(start, '/');
-	if (end)
-		*end = '\0';
-	return start;
-}
-
-char *get_pid_cgroup(pid_t pid, const char *contrl)
-{
-	int cfd;
-
-	cfd = get_cgroup_fd(contrl);
-	if (cfd < 0)
-		return false;
-
-	if (pure_unified_layout(cgroup_ops))
-		return cg_unified_get_current_cgroup(pid);
-
-	return cg_legacy_get_current_cgroup(pid, contrl);
-}
-
-/*
- * check whether a fuse context may access a cgroup dir or file
- *
- * If file is not null, it is a cgroup file to check under cg.
- * If file is null, then we are checking perms on cg itself.
- *
- * For files we can check the mode of the list_keys result.
- * For cgroups, we must make assumptions based on the files under the
- * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
- * yet.
- */
-static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
-{
-	struct cgfs_files *k = NULL;
-	bool ret = false;
-
-	k = cgfs_get_key(contrl, cg, file);
-	if (!k)
-		return false;
-
-	if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
-		if (perms_include(k->mode >> 6, mode)) {
-			ret = true;
-			goto out;
-		}
-	}
-	if (fc->gid == k->gid) {
-		if (perms_include(k->mode >> 3, mode)) {
-			ret = true;
-			goto out;
-		}
-	}
-	ret = perms_include(k->mode, mode);
-
-out:
-	free_key(k);
-	return ret;
-}
-
-#define INITSCOPE "/init.scope"
-void prune_init_slice(char *cg)
-{
-	char *point;
-	size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
-
-	if (cg_len < initscope_len)
-		return;
-
-	point = cg + cg_len - initscope_len;
-	if (strcmp(point, INITSCOPE) == 0) {
-		if (point == cg)
-			*(point+1) = '\0';
-		else
-			*point = '\0';
-	}
-}
-
-/*
- * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
- * If pid is in /a, he may act on /a/b, but not on /b.
- * if the answer is false and nextcg is not NULL, then *nextcg will point
- * to a string containing the next cgroup directory under cg, which must be
- * freed by the caller.
- */
-static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
-{
-	bool answer = false;
-	char *c2 = get_pid_cgroup(pid, contrl);
-	char *linecmp;
-
-	if (!c2)
-		return false;
-	prune_init_slice(c2);
-
-	/*
-	 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
-	 * they pass in a cgroup without leading '/'
-	 *
-	 * The original line here was:
-	 *	linecmp = *cg == '/' ? c2 : c2+1;
-	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
-	 *       Serge, do you know?
-	 */
-	if (*cg == '/' || !strncmp(cg, "./", 2))
-		linecmp = c2;
-	else
-		linecmp = c2 + 1;
-	if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
-		if (nextcg) {
-			*nextcg = get_next_cgroup_dir(linecmp, cg);
-		}
-		goto out;
-	}
-	answer = true;
-
-out:
-	free(c2);
-	return answer;
-}
-
-/*
- * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
- */
-static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
-{
-	bool answer = false;
-	char *c2, *task_cg;
-	size_t target_len, task_len;
-
-	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
-		return true;
-
-	c2 = get_pid_cgroup(pid, contrl);
-	if (!c2)
-		return false;
-	prune_init_slice(c2);
-
-	task_cg = c2 + 1;
-	target_len = strlen(cg);
-	task_len = strlen(task_cg);
-	if (task_len == 0) {
-		/* Task is in the root cg, it can see everything. This case is
-		 * not handled by the strmcps below, since they test for the
-		 * last /, but that is the first / that we've chopped off
-		 * above.
-		 */
-		answer = true;
-		goto out;
-	}
-	if (strcmp(cg, task_cg) == 0) {
-		answer = true;
-		goto out;
-	}
-	if (target_len < task_len) {
-		/* looking up a parent dir */
-		if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
-			answer = true;
-		goto out;
-	}
-	if (target_len > task_len) {
-		/* looking up a child dir */
-		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
-			answer = true;
-		goto out;
-	}
-
-out:
-	free(c2);
-	return answer;
-}
-
-/*
- * given /cgroup/freezer/a/b, return "freezer".
- * the returned char* should NOT be freed.
- */
-static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
-{
-	const char *p1;
-	char *contr, *slash;
-
-	if (strlen(path) < 9) {
-		errno = EACCES;
-		return NULL;
-	}
-	if (*(path + 7) != '/') {
-		errno = EINVAL;
-		return NULL;
-	}
-	p1 = path + 8;
-	contr = strdupa(p1);
-	if (!contr) {
-		errno = ENOMEM;
-		return NULL;
-	}
-	slash = strstr(contr, "/");
-	if (slash)
-		*slash = '\0';
-
-	for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
-		if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
-			return (*h)->__controllers;
-	}
-	errno = ENOENT;
-	return NULL;
-}
-
-/*
- * Find the start of cgroup in /cgroup/controller/the/cgroup/path
- * Note that the returned value may include files (keynames) etc
- */
-static const char *find_cgroup_in_path(const char *path)
-{
-	const char *p1;
-
-	if (strlen(path) < 9) {
-		errno = EACCES;
-		return NULL;
-	}
-	p1 = strstr(path + 8, "/");
-	if (!p1) {
-		errno = EINVAL;
-		return NULL;
-	}
-	errno = 0;
-	return p1 + 1;
-}
-
-/*
- * split the last path element from the path in @cg.
- * @dir is newly allocated and should be freed, @last not
-*/
-static void get_cgdir_and_path(const char *cg, char **dir, char **last)
-{
-	char *p;
-
-	do {
-		*dir = strdup(cg);
-	} while (!*dir);
-	*last = strrchr(cg, '/');
-	if (!*last) {
-		*last = NULL;
-		return;
-	}
-	p = strrchr(*dir, '/');
-	*p = '\0';
-}
-
-/*
- * FUSE ops for /cgroup
- */
-
-int cg_getattr(const char *path, struct stat *sb)
-{
-	struct timespec now;
-	struct fuse_context *fc = fuse_get_context();
-	char * cgdir = NULL;
-	char *last = NULL, *path1, *path2;
-	struct cgfs_files *k = NULL;
-	const char *cgroup;
-	const char *controller = NULL;
-	int ret = -ENOENT;
-
-
-	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-		return -EIO;
-
-	memset(sb, 0, sizeof(struct stat));
-
-	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
-		return -EINVAL;
-
-	sb->st_uid = sb->st_gid = 0;
-	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
-	sb->st_size = 0;
-
-	if (strcmp(path, "/cgroup") == 0) {
-		sb->st_mode = S_IFDIR | 00755;
-		sb->st_nlink = 2;
-		return 0;
-	}
-
-	controller = pick_controller_from_path(fc, path);
-	if (!controller)
-		return -errno;
-	cgroup = find_cgroup_in_path(path);
-	if (!cgroup) {
-		/* this is just /cgroup/controller, return it as a dir */
-		sb->st_mode = S_IFDIR | 00755;
-		sb->st_nlink = 2;
-		return 0;
-	}
-
-	get_cgdir_and_path(cgroup, &cgdir, &last);
-
-	if (!last) {
-		path1 = "/";
-		path2 = cgdir;
-	} else {
-		path1 = cgdir;
-		path2 = last;
-	}
-
-	pid_t initpid = lookup_initpid_in_store(fc->pid);
-	if (initpid <= 1 || is_shared_pidns(initpid))
-		initpid = fc->pid;
-	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
-	 * Then check that caller's cgroup is under path if last is a child
-	 * cgroup, or cgdir if last is a file */
-
-	if (is_child_cgroup(controller, path1, path2)) {
-		if (!caller_may_see_dir(initpid, controller, cgroup)) {
-			ret = -ENOENT;
-			goto out;
-		}
-		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
-			/* this is just /cgroup/controller, return it as a dir */
-			sb->st_mode = S_IFDIR | 00555;
-			sb->st_nlink = 2;
-			ret = 0;
-			goto out;
-		}
-		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
-			ret = -EACCES;
-			goto out;
-		}
-
-		// get uid, gid, from '/tasks' file and make up a mode
-		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
-		sb->st_mode = S_IFDIR | 00755;
-		k = cgfs_get_key(controller, cgroup, NULL);
-		if (!k) {
-			sb->st_uid = sb->st_gid = 0;
-		} else {
-			sb->st_uid = k->uid;
-			sb->st_gid = k->gid;
-		}
-		free_key(k);
-		sb->st_nlink = 2;
-		ret = 0;
-		goto out;
-	}
-
-	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
-		sb->st_mode = S_IFREG | k->mode;
-		sb->st_nlink = 1;
-		sb->st_uid = k->uid;
-		sb->st_gid = k->gid;
-		sb->st_size = 0;
-		free_key(k);
-		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
-			ret = -ENOENT;
-			goto out;
-		}
-		ret = 0;
-	}
-
-out:
-	free(cgdir);
-	return ret;
-}
-
-int cg_opendir(const char *path, struct fuse_file_info *fi)
-{
-	struct fuse_context *fc = fuse_get_context();
-	const char *cgroup;
-	struct file_info *dir_info;
-	char *controller = NULL;
-
-	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-		return -EIO;
-
-	if (strcmp(path, "/cgroup") == 0) {
-		cgroup = NULL;
-		controller = NULL;
-	} else {
-		// return list of keys for the controller, and list of child cgroups
-		controller = pick_controller_from_path(fc, path);
-		if (!controller)
-			return -errno;
-
-		cgroup = find_cgroup_in_path(path);
-		if (!cgroup) {
-			/* this is just /cgroup/controller, return its contents */
-			cgroup = "/";
-		}
-	}
-
-	pid_t initpid = lookup_initpid_in_store(fc->pid);
-	if (initpid <= 1 || is_shared_pidns(initpid))
-		initpid = fc->pid;
-	if (cgroup) {
-		if (!caller_may_see_dir(initpid, controller, cgroup))
-			return -ENOENT;
-		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
-			return -EACCES;
-	}
-
-	/* we'll free this at cg_releasedir */
-	dir_info = malloc(sizeof(*dir_info));
-	if (!dir_info)
-		return -ENOMEM;
-	dir_info->controller = must_copy_string(controller);
-	dir_info->cgroup = must_copy_string(cgroup);
-	dir_info->type = LXC_TYPE_CGDIR;
-	dir_info->buf = NULL;
-	dir_info->file = NULL;
-	dir_info->buflen = 0;
-
-	fi->fh = (unsigned long)dir_info;
-	return 0;
-}
-
-int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
-		struct fuse_file_info *fi)
-{
-	struct file_info *d = (struct file_info *)fi->fh;
-	struct cgfs_files **list = NULL;
-	int i, ret;
-	char *nextcg = NULL;
-	struct fuse_context *fc = fuse_get_context();
-	char **clist = NULL;
-
-	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-		return -EIO;
-
-	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
-		return -EIO;
-
-	if (d->type != LXC_TYPE_CGDIR) {
-		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
-		return -EIO;
-	}
-	if (!d->cgroup && !d->controller) {
-		/*
-		 * ls /var/lib/lxcfs/cgroup - just show list of controllers.
-		 * This only works with the legacy hierarchy.
-		 */
-		for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
-			if (is_unified_hierarchy(*h))
-				continue;
-
-			if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
-				return -EIO;
-		}
-
-		return 0;
-	}
-
-	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
-		// not a valid cgroup
-		ret = -EINVAL;
-		goto out;
-	}
-
-	pid_t initpid = lookup_initpid_in_store(fc->pid);
-	if (initpid <= 1 || is_shared_pidns(initpid))
-		initpid = fc->pid;
-	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
-		if (nextcg) {
-			ret = filler(buf, nextcg,  NULL, 0);
-			free(nextcg);
-			if (ret != 0) {
-				ret = -EIO;
-				goto out;
-			}
-		}
-		ret = 0;
-		goto out;
-	}
-
-	for (i = 0; list && list[i]; i++) {
-		if (filler(buf, list[i]->name, NULL, 0) != 0) {
-			ret = -EIO;
-			goto out;
-		}
-	}
-
-	// now get the list of child cgroups
-
-	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
-		ret = 0;
-		goto out;
-	}
-	if (clist) {
-		for (i = 0; clist[i]; i++) {
-			if (filler(buf, clist[i], NULL, 0) != 0) {
-				ret = -EIO;
-				goto out;
-			}
-		}
-	}
-	ret = 0;
-
-out:
-	free_keys(list);
-	if (clist) {
-		for (i = 0; clist[i]; i++)
-			free(clist[i]);
-		free(clist);
-	}
-	return ret;
-}
-
-void do_release_file_info(struct fuse_file_info *fi)
-{
-	struct file_info *f = (struct file_info *)fi->fh;
-
-	if (!f)
-		return;
-
-	fi->fh = 0;
-
-	free_disarm(f->controller);
-	free_disarm(f->cgroup);
-	free_disarm(f->file);
-	free_disarm(f->buf);
-	free_disarm(f);
-}
-
-int cg_releasedir(const char *path, struct fuse_file_info *fi)
-{
-	do_release_file_info(fi);
-	return 0;
-}
-
-int cg_open(const char *path, struct fuse_file_info *fi)
-{
-	const char *cgroup;
-	char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
-	struct cgfs_files *k = NULL;
-	struct file_info *file_info;
-	struct fuse_context *fc = fuse_get_context();
-	int ret;
-
-	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-		return -EIO;
-
-	controller = pick_controller_from_path(fc, path);
-	if (!controller)
-		return -errno;
-	cgroup = find_cgroup_in_path(path);
-	if (!cgroup)
-		return -errno;
-
-	get_cgdir_and_path(cgroup, &cgdir, &last);
-	if (!last) {
-		path1 = "/";
-		path2 = cgdir;
-	} else {
-		path1 = cgdir;
-		path2 = last;
-	}
-
-	k = cgfs_get_key(controller, path1, path2);
-	if (!k) {
-		ret = -EINVAL;
-		goto out;
-	}
-	free_key(k);
-
-	pid_t initpid = lookup_initpid_in_store(fc->pid);
-	if (initpid <= 1 || is_shared_pidns(initpid))
-		initpid = fc->pid;
-	if (!caller_may_see_dir(initpid, controller, path1)) {
-		ret = -ENOENT;
-		goto out;
-	}
-	if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
-		ret = -EACCES;
-		goto out;
-	}
-
-	/* we'll free this at cg_release */
-	file_info = malloc(sizeof(*file_info));
-	if (!file_info) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	file_info->controller = must_copy_string(controller);
-	file_info->cgroup = must_copy_string(path1);
-	file_info->file = must_copy_string(path2);
-	file_info->type = LXC_TYPE_CGFILE;
-	file_info->buf = NULL;
-	file_info->buflen = 0;
-
-	fi->fh = (unsigned long)file_info;
-	ret = 0;
-
-out:
-	free(cgdir);
-	return ret;
-}
-
-int cg_access(const char *path, int mode)
-{
-	int ret;
-	const char *cgroup;
-	char *path1, *path2, *controller;
-	char *last = NULL, *cgdir = NULL;
-	struct cgfs_files *k = NULL;
-	struct fuse_context *fc = fuse_get_context();
-
-	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-		return -EIO;
-
-	if (strcmp(path, "/cgroup") == 0)
-		return 0;
-
-	controller = pick_controller_from_path(fc, path);
-	if (!controller)
-		return -errno;
-	cgroup = find_cgroup_in_path(path);
-	if (!cgroup) {
-		// access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
-		if ((mode & W_OK) == 0)
-			return 0;
-		return -EACCES;
-	}
-
-	get_cgdir_and_path(cgroup, &cgdir, &last);
-	if (!last) {
-		path1 = "/";
-		path2 = cgdir;
-	} else {
-		path1 = cgdir;
-		path2 = last;
-	}
-
-	k = cgfs_get_key(controller, path1, path2);
-	if (!k) {
-		if ((mode & W_OK) == 0)
-			ret = 0;
-		else
-			ret = -EACCES;
-		goto out;
-	}
-	free_key(k);
-
-	pid_t initpid = lookup_initpid_in_store(fc->pid);
-	if (initpid <= 1 || is_shared_pidns(initpid))
-		initpid = fc->pid;
-	if (!caller_may_see_dir(initpid, controller, path1)) {
-		ret = -ENOENT;
-		goto out;
-	}
-	if (!fc_may_access(fc, controller, path1, path2, mode)) {
-		ret = -EACCES;
-		goto out;
-	}
-
-	ret = 0;
-
-out:
-	free(cgdir);
-	return ret;
-}
-
-int cg_release(const char *path, struct fuse_file_info *fi)
-{
-	do_release_file_info(fi);
-	return 0;
-}
-
-#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
-
-static bool wait_for_sock(int sock, int timeout)
-{
-	struct epoll_event ev;
-	int epfd, ret, now, starttime, deltatime, saved_errno;
-
-	if ((starttime = time(NULL)) < 0)
-		return false;
-
-	if ((epfd = epoll_create(1)) < 0) {
-		lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
-		return false;
-	}
-
-	ev.events = POLLIN_SET;
-	ev.data.fd = sock;
-	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
-		lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
-		close(epfd);
-		return false;
-	}
-
-again:
-	if ((now = time(NULL)) < 0) {
-		close(epfd);
-		return false;
-	}
-
-	deltatime = (starttime + timeout) - now;
-	if (deltatime < 0) { // timeout
-		errno = 0;
-		close(epfd);
-		return false;
-	}
-	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
-	if (ret < 0 && errno == EINTR)
-		goto again;
-	saved_errno = errno;
-	close(epfd);
-
-	if (ret <= 0) {
-		errno = saved_errno;
-		return false;
-	}
-	return true;
-}
-
-static int msgrecv(int sockfd, void *buf, size_t len)
-{
-	if (!wait_for_sock(sockfd, 2))
-		return -1;
-	return recv(sockfd, buf, len, MSG_DONTWAIT);
-}
-
-static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
-{
-	struct msghdr msg = { 0 };
-	struct iovec iov;
-	struct cmsghdr *cmsg;
-	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
-	char buf[1];
-	buf[0] = 'p';
-
-	if (pingfirst) {
-		if (msgrecv(sock, buf, 1) != 1) {
-			lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
-			return SEND_CREDS_FAIL;
-		}
-	}
-
-	msg.msg_control = cmsgbuf;
-	msg.msg_controllen = sizeof(cmsgbuf);
-
-	cmsg = CMSG_FIRSTHDR(&msg);
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
-	cmsg->cmsg_level = SOL_SOCKET;
-	cmsg->cmsg_type = SCM_CREDENTIALS;
-	memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
-
-	msg.msg_name = NULL;
-	msg.msg_namelen = 0;
-
-	buf[0] = v;
-	iov.iov_base = buf;
-	iov.iov_len = sizeof(buf);
-	msg.msg_iov = &iov;
-	msg.msg_iovlen = 1;
-
-	if (sendmsg(sock, &msg, 0) < 0) {
-		lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
-		if (errno == 3)
-			return SEND_CREDS_NOTSK;
-		return SEND_CREDS_FAIL;
-	}
-
-	return SEND_CREDS_OK;
-}
-
-static bool recv_creds(int sock, struct ucred *cred, char *v)
-{
-	struct msghdr msg = { 0 };
-	struct iovec iov;
-	struct cmsghdr *cmsg;
-	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
-	char buf[1];
-	int ret;
-	int optval = 1;
-
-	*v = '1';
-
-	cred->pid = -1;
-	cred->uid = -1;
-	cred->gid = -1;
-
-	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
-		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
-		return false;
-	}
-	buf[0] = '1';
-	if (write(sock, buf, 1) != 1) {
-		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
-		return false;
-	}
-
-	msg.msg_name = NULL;
-	msg.msg_namelen = 0;
-	msg.msg_control = cmsgbuf;
-	msg.msg_controllen = sizeof(cmsgbuf);
-
-	iov.iov_base = buf;
-	iov.iov_len = sizeof(buf);
-	msg.msg_iov = &iov;
-	msg.msg_iovlen = 1;
-
-	if (!wait_for_sock(sock, 2)) {
-		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
-		return false;
-	}
-	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
-	if (ret < 0) {
-		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
-		return false;
-	}
-
-	cmsg = CMSG_FIRSTHDR(&msg);
-
-	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
-			cmsg->cmsg_level == SOL_SOCKET &&
-			cmsg->cmsg_type == SCM_CREDENTIALS) {
-		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
-	}
-	*v = buf[0];
-
-	return true;
-}
-
-struct pid_ns_clone_args {
-	int *cpipe;
-	int sock;
-	pid_t tpid;
-	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
-};
-
-/*
- * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
- * with clone(). This simply writes '1' as ACK back to the parent
- * before calling the actual wrapped function.
- */
-static int pid_ns_clone_wrapper(void *arg) {
-	struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
-	char b = '1';
-
-	close(args->cpipe[0]);
-	if (write(args->cpipe[1], &b, sizeof(char)) < 0)
-		lxcfs_error("(child): error on write: %s.\n", strerror(errno));
-	close(args->cpipe[1]);
-	return args->wrapped(args->sock, args->tpid);
-}
-
-/*
- * pid_to_ns - reads pids from a ucred over a socket, then writes the
- * int value back over the socket.  This shifts the pid from the
- * sender's pidns into tpid's pidns.
- */
-static int pid_to_ns(int sock, pid_t tpid)
-{
-	char v = '0';
-	struct ucred cred;
-
-	while (recv_creds(sock, &cred, &v)) {
-		if (v == '1')
-			return 0;
-		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
-			return 1;
-	}
-	return 0;
-}
-
-
-/*
- * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
- * in your old pidns.  Only children which you clone will be in the target
- * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
- * actually convert pids.
- *
- * Note: glibc's fork() does not respect pidns, which can lead to failed
- * assertions inside glibc (and thus failed forks) if the child's pid in
- * the pidns and the parent pid outside are identical. Using clone prevents
- * this issue.
- */
-static void pid_to_ns_wrapper(int sock, pid_t tpid)
-{
-	int newnsfd = -1, ret, cpipe[2];
-	char fnam[100];
-	pid_t cpid;
-	char v;
-
-	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
-	if (ret < 0 || ret >= sizeof(fnam))
-		_exit(1);
-	newnsfd = open(fnam, O_RDONLY);
-	if (newnsfd < 0)
-		_exit(1);
-	if (setns(newnsfd, 0) < 0)
-		_exit(1);
-	close(newnsfd);
-
-	if (pipe(cpipe) < 0)
-		_exit(1);
-
-	struct pid_ns_clone_args args = {
-		.cpipe = cpipe,
-		.sock = sock,
-		.tpid = tpid,
-		.wrapped = &pid_to_ns
-	};
-	size_t stack_size = sysconf(_SC_PAGESIZE);
-	void *stack = alloca(stack_size);
-
-	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
-	if (cpid < 0)
-		_exit(1);
-
-	// give the child 1 second to be done forking and
-	// write its ack
-	if (!wait_for_sock(cpipe[0], 1))
-		_exit(1);
-	ret = read(cpipe[0], &v, 1);
-	if (ret != sizeof(char) || v != '1')
-		_exit(1);
-
-	if (!wait_for_pid(cpid))
-		_exit(1);
-	_exit(0);
-}
-
-/*
- * To read cgroup files with a particular pid, we will setns into the child
- * pidns, open a pipe, fork a child - which will be the first to really be in
- * the child ns - which does the cgfs_get_value and writes the data to the pipe.
- */
-bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
-{
-	int sock[2] = {-1, -1};
-	char *tmpdata = NULL;
-	int ret;
-	pid_t qpid, cpid = -1;
-	bool answer = false;
-	char v = '0';
-	struct ucred cred;
-	size_t sz = 0, asz = 0;
-
-	if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
-		return false;
-
-	/*
-	 * Now we read the pids from returned data one by one, pass
-	 * them into a child in the target namespace, read back the
-	 * translated pids, and put them into our to-return data
-	 */
-
-	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
-		perror("socketpair");
-		free(tmpdata);
-		return false;
-	}
-
-	cpid = fork();
-	if (cpid == -1)
-		goto out;
-
-	if (!cpid) // child - exits when done
-		pid_to_ns_wrapper(sock[1], tpid);
-
-	char *ptr = tmpdata;
-	cred.uid = 0;
-	cred.gid = 0;
-	while (sscanf(ptr, "%d\n", &qpid) == 1) {
-		cred.pid = qpid;
-		ret = send_creds(sock[0], &cred, v, true);
-
-		if (ret == SEND_CREDS_NOTSK)
-			goto next;
-		if (ret == SEND_CREDS_FAIL)
-			goto out;
-
-		// read converted results
-		if (!wait_for_sock(sock[0], 2)) {
-			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
-			goto out;
-		}
-		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
-			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
-			goto out;
-		}
-		must_strcat_pid(d, &sz, &asz, qpid);
-next:
-		ptr = strchr(ptr, '\n');
-		if (!ptr)
-			break;
-		ptr++;
-	}
-
-	cred.pid = getpid();
-	v = '1';
-	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
-		// failed to ask child to exit
-		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
-		goto out;
-	}
-
-	answer = true;
-
-out:
-	free(tmpdata);
-	if (cpid != -1)
-		wait_for_pid(cpid);
-	if (sock[0] != -1) {
-		close(sock[0]);
-		close(sock[1]);
-	}
-	return answer;
-}
-
-int cg_read(const char *path, char *buf, size_t size, off_t offset,
-		struct fuse_file_info *fi)
-{
-	struct fuse_context *fc = fuse_get_context();
-	struct file_info *f = (struct file_info *)fi->fh;
-	struct cgfs_files *k = NULL;
-	char *data = NULL;
-	int ret, s;
-	bool r;
-
-	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-		return -EIO;
-
-	if (f->type != LXC_TYPE_CGFILE) {
-		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
-		return -EIO;
-	}
-
-	if (offset)
-		return 0;
-
-	if (!f->controller)
-		return -EINVAL;
-
-	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
-		return -EINVAL;
-	}
-	free_key(k);
-
-
-	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
-		ret = -EACCES;
-		goto out;
-	}
-
-	if (strcmp(f->file, "tasks") == 0 ||
-			strcmp(f->file, "/tasks") == 0 ||
-			strcmp(f->file, "/cgroup.procs") == 0 ||
-			strcmp(f->file, "cgroup.procs") == 0)
-		// special case - we have to translate the pids
-		r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
-	else
-		r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data);
-
-	if (!r) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (!data) {
-		ret = 0;
-		goto out;
-	}
-	s = strlen(data);
-	if (s > size)
-		s = size;
-	memcpy(buf, data, s);
-	if (s > 0 && s < size && data[s-1] != '\n')
-		buf[s++] = '\n';
-
-	ret = s;
-
-out:
-	free(data);
-	return ret;
-}
-
-static int pid_from_ns(int sock, pid_t tpid)
-{
-	pid_t vpid;
-	struct ucred cred;
-	char v;
-	int ret;
-
-	cred.uid = 0;
-	cred.gid = 0;
-	while (1) {
-		if (!wait_for_sock(sock, 2)) {
-			lxcfs_error("%s\n", "Timeout reading from parent.");
-			return 1;
-		}
-		if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
-			lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
-			return 1;
-		}
-		if (vpid == -1) // done
-			break;
-		v = '0';
-		cred.pid = vpid;
-		if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
-			v = '1';
-			cred.pid = getpid();
-			if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
-				return 1;
-		}
-	}
-	return 0;
-}
-
-static void pid_from_ns_wrapper(int sock, pid_t tpid)
-{
-	int newnsfd = -1, ret, cpipe[2];
-	char fnam[100];
-	pid_t cpid;
-	char v;
-
-	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
-	if (ret < 0 || ret >= sizeof(fnam))
-		_exit(1);
-	newnsfd = open(fnam, O_RDONLY);
-	if (newnsfd < 0)
-		_exit(1);
-	if (setns(newnsfd, 0) < 0)
-		_exit(1);
-	close(newnsfd);
-
-	if (pipe(cpipe) < 0)
-		_exit(1);
-
-	struct pid_ns_clone_args args = {
-		.cpipe = cpipe,
-		.sock = sock,
-		.tpid = tpid,
-		.wrapped = &pid_from_ns
-	};
-	size_t stack_size = sysconf(_SC_PAGESIZE);
-	void *stack = alloca(stack_size);
-
-	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
-	if (cpid < 0)
-		_exit(1);
-
-	// give the child 1 second to be done forking and
-	// write its ack
-	if (!wait_for_sock(cpipe[0], 1))
-		_exit(1);
-	ret = read(cpipe[0], &v, 1);
-	if (ret != sizeof(char) || v != '1')
-		_exit(1);
-
-	if (!wait_for_pid(cpid))
-		_exit(1);
-	_exit(0);
-}
-
-/*
- * Given host @uid, return the uid to which it maps in
- * @pid's user namespace, or -1 if none.
- */
-bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
-{
-	FILE *f;
-	char line[400];
-
-	sprintf(line, "/proc/%d/uid_map", pid);
-	if ((f = fopen(line, "r")) == NULL) {
-		return false;
-	}
-
-	*answer = convert_id_to_ns(f, uid);
-	fclose(f);
-
-	if (*answer == -1)
-		return false;
-	return true;
-}
-
-/*
- * get_pid_creds: get the real uid and gid of @pid from
- * /proc/$$/status
- * (XXX should we use euid here?)
- */
-void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
-{
-	char line[400];
-	uid_t u;
-	gid_t g;
-	FILE *f;
-
-	*uid = -1;
-	*gid = -1;
-	sprintf(line, "/proc/%d/status", pid);
-	if ((f = fopen(line, "r")) == NULL) {
-		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
-		return;
-	}
-	while (fgets(line, 400, f)) {
-		if (strncmp(line, "Uid:", 4) == 0) {
-			if (sscanf(line+4, "%u", &u) != 1) {
-				lxcfs_error("bad uid line for pid %u\n", pid);
-				fclose(f);
-				return;
-			}
-			*uid = u;
-		} else if (strncmp(line, "Gid:", 4) == 0) {
-			if (sscanf(line+4, "%u", &g) != 1) {
-				lxcfs_error("bad gid line for pid %u\n", pid);
-				fclose(f);
-				return;
-			}
-			*gid = g;
-		}
-	}
-	fclose(f);
-}
-
-/*
- * May the requestor @r move victim @v to a new cgroup?
- * This is allowed if
- *   . they are the same task
- *   . they are ownedy by the same uid
- *   . @r is root on the host, or
- *   . @v's uid is mapped into @r's where @r is root.
- */
-bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
-{
-	uid_t v_uid, tmpuid;
-	gid_t v_gid;
-
-	if (r == v)
-		return true;
-	if (r_uid == 0)
-		return true;
-	get_pid_creds(v, &v_uid, &v_gid);
-	if (r_uid == v_uid)
-		return true;
-	if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
-			&& hostuid_to_ns(v_uid, r, &tmpuid))
-		return true;
-	return false;
-}
-
-static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
-		const char *file, const char *buf)
-{
-	int sock[2] = {-1, -1};
-	pid_t qpid, cpid = -1;
-	FILE *pids_file = NULL;
-	bool answer = false, fail = false;
-
-	pids_file = open_pids_file(contrl, cg);
-	if (!pids_file)
+	ev.events = POLLIN_SET;
+	ev.data.fd = sock;
+	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
+		lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
+		close(epfd);
 		return false;
-
-	/*
-	 * write the pids to a socket, have helper in writer's pidns
-	 * call movepid for us
-	 */
-	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
-		perror("socketpair");
-		goto out;
-	}
-
-	cpid = fork();
-	if (cpid == -1)
-		goto out;
-
-	if (!cpid) { // child
-		fclose(pids_file);
-		pid_from_ns_wrapper(sock[1], tpid);
-	}
-
-	const char *ptr = buf;
-	while (sscanf(ptr, "%d", &qpid) == 1) {
-		struct ucred cred;
-		char v;
-
-		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
-			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
-			goto out;
-		}
-
-		if (recv_creds(sock[0], &cred, &v)) {
-			if (v == '0') {
-				if (!may_move_pid(tpid, tuid, cred.pid)) {
-					fail = true;
-					break;
-				}
-				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
-					fail = true;
-			}
-		}
-
-		ptr = strchr(ptr, '\n');
-		if (!ptr)
-			break;
-		ptr++;
-	}
-
-	/* All good, write the value */
-	qpid = -1;
-	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
-		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
-
-	if (!fail)
-		answer = true;
-
-out:
-	if (cpid != -1)
-		wait_for_pid(cpid);
-	if (sock[0] != -1) {
-		close(sock[0]);
-		close(sock[1]);
 	}
-	if (pids_file) {
-		if (fclose(pids_file) != 0)
-			answer = false;
-	}
-	return answer;
-}
-
-int cg_write(const char *path, const char *buf, size_t size, off_t offset,
-	     struct fuse_file_info *fi)
-{
-	struct fuse_context *fc = fuse_get_context();
-	char *localbuf = NULL;
-	struct cgfs_files *k = NULL;
-	struct file_info *f = (struct file_info *)fi->fh;
-	bool r;
 
-	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-		return -EIO;
-
-	if (f->type != LXC_TYPE_CGFILE) {
-		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
-		return -EIO;
+again:
+	if ((now = time(NULL)) < 0) {
+		close(epfd);
+		return false;
 	}
 
-	if (offset)
-		return 0;
-
-	localbuf = alloca(size+1);
-	localbuf[size] = '\0';
-	memcpy(localbuf, buf, size);
-
-	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
-		size = -EINVAL;
-		goto out;
+	deltatime = (starttime + timeout) - now;
+	if (deltatime < 0) { // timeout
+		errno = 0;
+		close(epfd);
+		return false;
 	}
+	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
+	if (ret < 0 && errno == EINTR)
+		goto again;
+	saved_errno = errno;
+	close(epfd);
 
-	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
-		size = -EACCES;
-		goto out;
+	if (ret <= 0) {
+		errno = saved_errno;
+		return false;
 	}
-
-	if (strcmp(f->file, "tasks") == 0 ||
-			strcmp(f->file, "/tasks") == 0 ||
-			strcmp(f->file, "/cgroup.procs") == 0 ||
-			strcmp(f->file, "cgroup.procs") == 0)
-		// special case - we have to translate the pids
-		r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
-	else
-		r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
-
-	if (!r)
-		size = -EINVAL;
-
-out:
-	free_key(k);
-	return size;
+	return true;
 }
 
-int cg_chown(const char *path, uid_t uid, gid_t gid)
+static int msgrecv(int sockfd, void *buf, size_t len)
 {
-	struct fuse_context *fc = fuse_get_context();
-	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
-	struct cgfs_files *k = NULL;
-	const char *cgroup;
-	int ret;
-
-	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-		return -EIO;
-
-	if (strcmp(path, "/cgroup") == 0)
-		return -EPERM;
-
-	controller = pick_controller_from_path(fc, path);
-	if (!controller)
-		return errno == ENOENT ? -EPERM : -errno;
-
-	cgroup = find_cgroup_in_path(path);
-	if (!cgroup)
-		/* this is just /cgroup/controller */
-		return -EPERM;
-
-	get_cgdir_and_path(cgroup, &cgdir, &last);
-
-	if (!last) {
-		path1 = "/";
-		path2 = cgdir;
-	} else {
-		path1 = cgdir;
-		path2 = last;
-	}
-
-	if (is_child_cgroup(controller, path1, path2)) {
-		// get uid, gid, from '/tasks' file and make up a mode
-		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
-		k = cgfs_get_key(controller, cgroup, "tasks");
-
-	} else
-		k = cgfs_get_key(controller, path1, path2);
-
-	if (!k) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/*
-	 * This being a fuse request, the uid and gid must be valid
-	 * in the caller's namespace.  So we can just check to make
-	 * sure that the caller is root in his uid, and privileged
-	 * over the file's current owner.
-	 */
-	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
-		ret = -EACCES;
-		goto out;
-	}
-
-	ret = cgfs_chown_file(controller, cgroup, uid, gid);
-
-out:
-	free_key(k);
-	free(cgdir);
-
-	return ret;
+	if (!wait_for_sock(sockfd, 2))
+		return -1;
+	return recv(sockfd, buf, len, MSG_DONTWAIT);
 }
 
-int cg_chmod(const char *path, mode_t mode)
+static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
 {
-	struct fuse_context *fc = fuse_get_context();
-	char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
-	struct cgfs_files *k = NULL;
-	const char *cgroup;
-	int ret;
-
-	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-		return -EIO;
-
-	if (strcmp(path, "/cgroup") == 0)
-		return -EPERM;
-
-	controller = pick_controller_from_path(fc, path);
-	if (!controller)
-		return errno == ENOENT ? -EPERM : -errno;
-
-	cgroup = find_cgroup_in_path(path);
-	if (!cgroup)
-		/* this is just /cgroup/controller */
-		return -EPERM;
-
-	get_cgdir_and_path(cgroup, &cgdir, &last);
+	struct msghdr msg = { 0 };
+	struct iovec iov;
+	struct cmsghdr *cmsg;
+	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
+	char buf[1];
+	buf[0] = 'p';
 
-	if (!last) {
-		path1 = "/";
-		path2 = cgdir;
-	} else {
-		path1 = cgdir;
-		path2 = last;
+	if (pingfirst) {
+		if (msgrecv(sock, buf, 1) != 1) {
+			lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
+			return SEND_CREDS_FAIL;
+		}
 	}
 
-	if (is_child_cgroup(controller, path1, path2)) {
-		// get uid, gid, from '/tasks' file and make up a mode
-		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
-		k = cgfs_get_key(controller, cgroup, "tasks");
+	msg.msg_control = cmsgbuf;
+	msg.msg_controllen = sizeof(cmsgbuf);
 
-	} else
-		k = cgfs_get_key(controller, path1, path2);
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_CREDENTIALS;
+	memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
 
-	if (!k) {
-		ret = -EINVAL;
-		goto out;
-	}
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
 
-	/*
-	 * This being a fuse request, the uid and gid must be valid
-	 * in the caller's namespace.  So we can just check to make
-	 * sure that the caller is root in his uid, and privileged
-	 * over the file's current owner.
-	 */
-	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
-		ret = -EPERM;
-		goto out;
-	}
+	buf[0] = v;
+	iov.iov_base = buf;
+	iov.iov_len = sizeof(buf);
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
 
-	if (!cgfs_chmod_file(controller, cgroup, mode)) {
-		ret = -EINVAL;
-		goto out;
+	if (sendmsg(sock, &msg, 0) < 0) {
+		/* Snapshot errno before logging (which may overwrite it); ESRCH is 3 on Linux. */
+		int saved_errno = errno;
+		lxcfs_error("Failed at sendmsg: %s.\n", strerror(saved_errno));
+		return saved_errno == ESRCH ? SEND_CREDS_NOTSK : SEND_CREDS_FAIL;
 	}
 
-	ret = 0;
-out:
-	free_key(k);
-	free(cgdir);
-	return ret;
+	return SEND_CREDS_OK;
 }
 
-int cg_mkdir(const char *path, mode_t mode)
+static bool recv_creds(int sock, struct ucred *cred, char *v)
 {
-	struct fuse_context *fc = fuse_get_context();
-	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
-	const char *cgroup;
+	struct msghdr msg = { 0 };
+	struct iovec iov;
+	struct cmsghdr *cmsg;
+	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
+	char buf[1];
 	int ret;
+	int optval = 1;
 
-	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-		return -EIO;
-
-	controller = pick_controller_from_path(fc, path);
-	if (!controller)
-		return errno == ENOENT ? -EPERM : -errno;
-
-	cgroup = find_cgroup_in_path(path);
-	if (!cgroup)
-		return -errno;
-
-	get_cgdir_and_path(cgroup, &cgdir, &last);
-	if (!last)
-		path1 = "/";
-	else
-		path1 = cgdir;
+	*v = '1';
 
-	pid_t initpid = lookup_initpid_in_store(fc->pid);
-	if (initpid <= 1 || is_shared_pidns(initpid))
-		initpid = fc->pid;
-	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
-		if (!next)
-			ret = -EINVAL;
-		else if (last && strcmp(next, last) == 0)
-			ret = -EEXIST;
-		else
-			ret = -EPERM;
-		goto out;
-	}
+	cred->pid = -1;
+	cred->uid = -1;
+	cred->gid = -1;
 
-	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
-		ret = -EACCES;
-		goto out;
+	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
+		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
+		return false;
 	}
-	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
-		ret = -EACCES;
-		goto out;
+	buf[0] = '1';
+	if (write(sock, buf, 1) != 1) {
+		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
+		return false;
 	}
 
-	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
-
-out:
-	free(cgdir);
-	free(next);
-	return ret;
-}
-
-int cg_rmdir(const char *path)
-{
-	struct fuse_context *fc = fuse_get_context();
-	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
-	const char *cgroup;
-	int ret;
-
-	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
-		return -EIO;
-
-	controller = pick_controller_from_path(fc, path);
-	if (!controller) /* Someone's trying to delete "/cgroup". */
-		return -EPERM;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_control = cmsgbuf;
+	msg.msg_controllen = sizeof(cmsgbuf);
 
-	cgroup = find_cgroup_in_path(path);
-	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
-		return -EPERM;
+	iov.iov_base = buf;
+	iov.iov_len = sizeof(buf);
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
 
-	get_cgdir_and_path(cgroup, &cgdir, &last);
-	if (!last) {
-		/* Someone's trying to delete a cgroup on the same level as the
-		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
-		 * rmdir "/cgroup/blkio/init.slice".
-		 */
-		ret = -EPERM;
-		goto out;
+	if (!wait_for_sock(sock, 2)) {
+		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
+		return false;
 	}
-
-	pid_t initpid = lookup_initpid_in_store(fc->pid);
-	if (initpid <= 1 || is_shared_pidns(initpid))
-		initpid = fc->pid;
-	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
-		if (!last || (next && (strcmp(next, last) == 0)))
-			ret = -EBUSY;
-		else
-			ret = -ENOENT;
-		goto out;
+	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
+	if (ret < 0) {
+		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
+		return false;
 	}
 
-	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
-		ret = -EACCES;
-		goto out;
-	}
-	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
-		ret = -EACCES;
-		goto out;
-	}
+	cmsg = CMSG_FIRSTHDR(&msg);
 
-	if (!cgfs_remove(controller, cgroup)) {
-		ret = -EINVAL;
-		goto out;
+	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
+			cmsg->cmsg_level == SOL_SOCKET &&
+			cmsg->cmsg_type == SCM_CREDENTIALS) {
+		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
 	}
+	*v = buf[0];
 
-	ret = 0;
-
-out:
-	free(cgdir);
-	free(next);
-	return ret;
+	return true;
 }
 
+struct pid_ns_clone_args { // argument bundle passed through clone() to pid_ns_clone_wrapper
+	int *cpipe; // pipe fd pair: the clone child closes [0] and writes its '1' ACK to [1]
+	int sock; // forwarded as the first argument of ->wrapped
+	pid_t tpid; // forwarded as the second argument of ->wrapped
+	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
+};
+
 static bool startswith(const char *line, const char *pref)
 {
 	if (strncmp(line, pref, strlen(pref)) == 0)
diff --git a/bindings.h b/bindings.h
index e3c0c83..7f928d6 100644
--- a/bindings.h
+++ b/bindings.h
@@ -2,6 +2,7 @@
 #define __LXCFS_BINDINGS_H
 
 #include "macro.h"
+#include "cgroup_fuse.h"
 #include "sysfs_fuse.h"
 
 /* directory under which we mount the controllers - /run/lxcfs/controllers */
@@ -42,23 +43,6 @@ struct lxcfs_opts {
 	bool swap_off;
 };
 
-extern int cg_write(const char *path, const char *buf, size_t size, off_t offset,
-	     struct fuse_file_info *fi);
-extern int cg_mkdir(const char *path, mode_t mode);
-extern int cg_chown(const char *path, uid_t uid, gid_t gid);
-extern int cg_rmdir(const char *path);
-extern int cg_chmod(const char *path, mode_t mode);
-extern int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
-		struct fuse_file_info *fi);
-extern int cg_releasedir(const char *path, struct fuse_file_info *fi);
-extern int cg_release(const char *path, struct fuse_file_info *fi);
-extern int cg_read(const char *path, char *buf, size_t size, off_t offset,
-		struct fuse_file_info *fi);
-extern int cg_opendir(const char *path, struct fuse_file_info *fi);
-extern int cg_getattr(const char *path, struct stat *sb);
-extern int cg_open(const char *path, struct fuse_file_info *fi);
-extern int cg_access(const char *path, int mode);
-
 extern int proc_getattr(const char *path, struct stat *sb);
 extern int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
 		struct fuse_file_info *fi);
diff --git a/cgroup_fuse.c b/cgroup_fuse.c
new file mode 100644
index 0000000..e7833a2
--- /dev/null
+++ b/cgroup_fuse.c
@@ -0,0 +1,2302 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#define FUSE_USE_VERSION 26
+
+#define __STDC_FORMAT_MACROS
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <fuse.h>
+#include <inttypes.h>
+#include <libgen.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <wait.h>
+#include <linux/magic.h>
+#include <linux/sched.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+#include <sys/sysinfo.h>
+#include <sys/vfs.h>
+
+#include "bindings.h"
+#include "config.h"
+#include "cgroups/cgroup.h"
+#include "cgroups/cgroup_utils.h"
+#include "memory_utils.h"
+#include "utils.h"
+
+struct cgfs_files {
+	char *name;
+	uint32_t uid, gid;
+	uint32_t mode;
+};
+
+struct pid_ns_clone_args {
+	int *cpipe;
+	int sock;
+	pid_t tpid;
+	/* pid_from_ns or pid_to_ns. */
+	int (*wrapped) (int, pid_t);
+};
+
+/*
+ * given /cgroup/freezer/a/b, return "freezer".
+ * the returned char* should NOT be freed.
+ */
+static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
+{
+	const char *p1;
+	char *contr, *slash;
+
+	if (strlen(path) < 9) {
+		errno = EACCES;
+		return NULL;
+	}
+	if (*(path + 7) != '/') {
+		errno = EINVAL;
+		return NULL;
+	}
+	p1 = path + 8;
+	contr = strdupa(p1);
+	if (!contr) {
+		errno = ENOMEM;
+		return NULL;
+	}
+	slash = strstr(contr, "/");
+	if (slash)
+		*slash = '\0';
+
+	for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
+		if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
+			return (*h)->__controllers;
+	}
+	errno = ENOENT;
+	return NULL;
+}
+
+/*
+ * Find the start of cgroup in /cgroup/controller/the/cgroup/path
+ * Note that the returned value may include files (keynames) etc
+ */
+static const char *find_cgroup_in_path(const char *path)
+{
+	const char *p1;
+
+	if (strlen(path) < 9) {
+		errno = EACCES;
+		return NULL;
+	}
+	p1 = strstr(path + 8, "/");
+	if (!p1) {
+		errno = EINVAL;
+		return NULL;
+	}
+	errno = 0;
+	return p1 + 1;
+}
+
+/*
+ * split the last path element from the path in @cg.
+ * @dir is newly allocated and should be freed, @last not
+*/
+static void get_cgdir_and_path(const char *cg, char **dir, char **last)
+{
+	char *p;
+
+	do {
+		*dir = strdup(cg);
+	} while (!*dir);
+	*last = strrchr(cg, '/');
+	if (!*last) {
+		*last = NULL;
+		return;
+	}
+	p = strrchr(*dir, '/');
+	*p = '\0';
+}
+
+static bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
+{
+	int cfd;
+	size_t len;
+	char *fnam;
+	int ret;
+	struct stat sb;
+
+	cfd = get_cgroup_fd(controller);
+	if (cfd < 0)
+		return false;
+
+	/* Make sure we pass a relative path to *at() family of functions.
+	 * . + /cgroup + / + f + \0
+	 */
+	len = strlen(cgroup) + strlen(f) + 3;
+	fnam = alloca(len);
+	ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
+	if (ret < 0 || (size_t)ret >= len)
+		return false;
+
+	ret = fstatat(cfd, fnam, &sb, 0);
+	if (ret < 0 || !S_ISDIR(sb.st_mode))
+		return false;
+
+	return true;
+}
+
+/*
+ * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
+ */
+static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
+{
+	bool answer = false;
+	char *c2, *task_cg;
+	size_t target_len, task_len;
+
+	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
+		return true;
+
+	c2 = get_pid_cgroup(pid, contrl);
+	if (!c2)
+		return false;
+	prune_init_slice(c2);
+
+	task_cg = c2 + 1;
+	target_len = strlen(cg);
+	task_len = strlen(task_cg);
+	if (task_len == 0) {
+		/* Task is in the root cg, it can see everything. This case is
+		 * not handled by the strncmps below, since they test for the
+		 * last /, but that is the first / that we've chopped off
+		 * above.
+		 */
+		answer = true;
+		goto out;
+	}
+	if (strcmp(cg, task_cg) == 0) {
+		answer = true;
+		goto out;
+	}
+	if (target_len < task_len) {
+		/* looking up a parent dir */
+		if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
+			answer = true;
+		goto out;
+	}
+	if (target_len > task_len) {
+		/* looking up a child dir */
+		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
+			answer = true;
+		goto out;
+	}
+
+out:
+	free(c2);
+	return answer;
+}
+
+/*
+ * taskcg is  a/b/c
+ * querycg is /a/b/c/d/e
+ * we return 'd'
+ */
+static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
+{
+	char *start, *end;
+
+	if (strlen(taskcg) <= strlen(querycg)) {
+		lxcfs_error("%s\n", "I was fed bad input.");
+		return NULL;
+	}
+
+	if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
+		start =  strdup(taskcg + 1);
+	else
+		start = strdup(taskcg + strlen(querycg) + 1);
+	if (!start)
+		return NULL;
+	end = strchr(start, '/');
+	if (end)
+		*end = '\0';
+	return start;
+}
+
+/*
+ * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
+ * If pid is in /a, he may act on /a/b, but not on /b.
+ * if the answer is false and nextcg is not NULL, then *nextcg will point
+ * to a string containing the next cgroup directory under cg, which must be
+ * freed by the caller.
+ */
+static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
+{
+	bool answer = false;
+	char *c2 = get_pid_cgroup(pid, contrl);
+	char *linecmp;
+
+	if (!c2)
+		return false;
+	prune_init_slice(c2);
+
+	/*
+	 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
+	 * they pass in a cgroup without leading '/'
+	 *
+	 * The original line here was:
+	 *	linecmp = *cg == '/' ? c2 : c2+1;
+	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
+	 *       Serge, do you know?
+	 */
+	if (*cg == '/' || !strncmp(cg, "./", 2))
+		linecmp = c2;
+	else
+		linecmp = c2 + 1;
+	if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
+		if (nextcg) {
+			*nextcg = get_next_cgroup_dir(linecmp, cg);
+		}
+		goto out;
+	}
+	answer = true;
+
+out:
+	free(c2);
+	return answer;
+}
+
+/* stat() @file inside @cgroup of @controller (or @cgroup itself when @file is
+ * NULL) and return a newly allocated key, or NULL; release it with free_key(). */
+static struct cgfs_files *cgfs_get_key(const char *controller,
+				       const char *cgroup, const char *file)
+{
+	int ret, cfd;
+	size_t len;
+	char *fnam;
+	struct stat sb;
+	struct cgfs_files *newkey;
+
+	cfd = get_cgroup_fd(controller);
+	if (cfd < 0)
+		return NULL; /* pointer return: NULL, not the boolean "false" */
+
+	if (file && *file == '/')
+		file++;
+
+	if (file && strchr(file, '/'))
+		return NULL;
+
+	/* *at() functions need a relative path: . + /cgroup + / + file + \0 */
+	len = strlen(cgroup) + 3;
+	if (file)
+		len += strlen(file) + 1;
+	fnam = alloca(len);
+	snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
+		 file ? "/" : "", file ? file : "");
+
+	ret = fstatat(cfd, fnam, &sb, 0);
+	if (ret < 0)
+		return NULL;
+
+	do {
+		newkey = malloc(sizeof(struct cgfs_files));
+	} while (!newkey);
+	if (file)
+		newkey->name = must_copy_string(file);
+	else if (strrchr(cgroup, '/'))
+		newkey->name = must_copy_string(strrchr(cgroup, '/'));
+	else
+		newkey->name = must_copy_string(cgroup);
+	newkey->uid = sb.st_uid;
+	newkey->gid = sb.st_gid;
+	newkey->mode = sb.st_mode;
+
+	return newkey;
+}
+
+/*
+ * Given an open FILE * for /proc/pid/{u,g}id_map, and an id
+ * valid in the caller's namespace, return the id mapped into
+ * pid's namespace.
+ * Returns the mapped id, or -1 on error.
+ */
+static unsigned int convert_id_to_ns(FILE *idfile, unsigned int in_id)
+{
+	unsigned int nsuid,   // base id for a range in the idfile's namespace
+		     hostuid, // base id for a range in the caller's namespace
+		     count;   // number of ids in this range
+	char line[400];
+	int ret;
+
+	fseek(idfile, 0L, SEEK_SET);
+	while (fgets(line, 400, idfile)) {
+		ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
+		if (ret != 3)
+			continue;
+		if (hostuid + count < hostuid || nsuid + count < nsuid) {
+			/*
+			 * uids wrapped around - unexpected as this is a procfile,
+			 * so just bail.
+			 */
+			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
+				nsuid, hostuid, count, line);
+			return -1;
+		}
+		if (hostuid <= in_id && hostuid+count > in_id) {
+			/*
+			 * now since hostuid <= in_id < hostuid+count, and
+			 * hostuid+count and nsuid+count do not wrap around,
+			 * we know that nsuid+(in_id-hostuid) which must be
+			 * less that nsuid+(count) must not wrap around
+			 */
+			return (in_id - hostuid) + nsuid;
+		}
+	}
+
+	// no answer found
+	return -1;
+}
+
+/*
+ * for is_privileged_over,
+ * specify whether we require the calling uid to be root in his
+ * namespace
+ */
+#define NS_ROOT_REQD true
+#define NS_ROOT_OPT false
+
+#define PROCLEN 100
+
+static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
+{
+	char fpath[PROCLEN];
+	int ret;
+	bool answer = false;
+	uid_t nsuid;
+
+	if (victim == -1 || uid == -1)
+		return false;
+
+	/*
+	 * If the request is one not requiring root in the namespace,
+	 * then having the same uid suffices.  (i.e. uid 1000 has write
+	 * access to files owned by uid 1000
+	 */
+	if (!req_ns_root && uid == victim)
+		return true;
+
+	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
+	if (ret < 0 || ret >= PROCLEN)
+		return false;
+	FILE *f = fopen(fpath, "r");
+	if (!f)
+		return false;
+
+	/* if caller's not root in his namespace, reject */
+	nsuid = convert_id_to_ns(f, uid);
+	if (nsuid)
+		goto out;
+
+	/*
+	 * If victim is not mapped into caller's ns, reject.
+	 * XXX I'm not sure this check is needed given that fuse
+	 * will be sending requests where the vfs has converted
+	 */
+	nsuid = convert_id_to_ns(f, victim);
+	if (nsuid == -1)
+		goto out;
+
+	answer = true;
+
+out:
+	fclose(f);
+	return answer;
+}
+
+/* Check that the "other" permission bits of @fmode grant the O_ACCMODE
+ * access requested in @req_mode; unknown access modes are refused. */
+static bool perms_include(int fmode, mode_t req_mode)
+{
+	const mode_t access = req_mode & O_ACCMODE;
+	mode_t wanted;
+
+	if (access == O_RDONLY)
+		wanted = S_IROTH;
+	else if (access == O_WRONLY)
+		wanted = S_IWOTH;
+	else if (access == O_RDWR)
+		wanted = S_IROTH | S_IWOTH;
+	else
+		return false;
+
+	/* Every wanted bit has to be present. */
+	return (fmode & wanted) == wanted;
+}
+
+static void free_key(struct cgfs_files *k)
+{
+	if (!k)
+		return;
+	free_disarm(k->name);
+	free_disarm(k);
+}
+
+/*
+ * check whether a fuse context may access a cgroup dir or file
+ *
+ * If file is not null, it is a cgroup file to check under cg.
+ * If file is null, then we are checking perms on cg itself.
+ *
+ * For files we can check the mode of the list_keys result.
+ * For cgroups, we must make assumptions based on the files under the
+ * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
+ * yet.
+ */
+static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
+{
+	struct cgfs_files *k = NULL;
+	bool ret = false;
+
+	k = cgfs_get_key(contrl, cg, file);
+	if (!k)
+		return false;
+
+	if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
+		if (perms_include(k->mode >> 6, mode)) {
+			ret = true;
+			goto out;
+		}
+	}
+	if (fc->gid == k->gid) {
+		if (perms_include(k->mode >> 3, mode)) {
+			ret = true;
+			goto out;
+		}
+	}
+	ret = perms_include(k->mode, mode);
+
+out:
+	free_key(k);
+	return ret;
+}
+
+int cg_getattr(const char *path, struct stat *sb)
+{
+	struct timespec now;
+	struct fuse_context *fc = fuse_get_context();
+	char * cgdir = NULL;
+	char *last = NULL, *path1, *path2;
+	struct cgfs_files *k = NULL;
+	const char *cgroup;
+	const char *controller = NULL;
+	int ret = -ENOENT;
+
+
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
+	memset(sb, 0, sizeof(struct stat));
+
+	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
+		return -EINVAL;
+
+	sb->st_uid = sb->st_gid = 0;
+	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
+	sb->st_size = 0;
+
+	if (strcmp(path, "/cgroup") == 0) {
+		sb->st_mode = S_IFDIR | 00755;
+		sb->st_nlink = 2;
+		return 0;
+	}
+
+	controller = pick_controller_from_path(fc, path);
+	if (!controller)
+		return -errno;
+	cgroup = find_cgroup_in_path(path);
+	if (!cgroup) {
+		/* this is just /cgroup/controller, return it as a dir */
+		sb->st_mode = S_IFDIR | 00755;
+		sb->st_nlink = 2;
+		return 0;
+	}
+
+	get_cgdir_and_path(cgroup, &cgdir, &last);
+
+	if (!last) {
+		path1 = "/";
+		path2 = cgdir;
+	} else {
+		path1 = cgdir;
+		path2 = last;
+	}
+
+	pid_t initpid = lookup_initpid_in_store(fc->pid);
+	if (initpid <= 1 || is_shared_pidns(initpid))
+		initpid = fc->pid;
+	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
+	 * Then check that caller's cgroup is under path if last is a child
+	 * cgroup, or cgdir if last is a file */
+
+	if (is_child_cgroup(controller, path1, path2)) {
+		if (!caller_may_see_dir(initpid, controller, cgroup)) {
+			ret = -ENOENT;
+			goto out;
+		}
+		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
+			/* visible to the caller but not under its own cgroup: report a read-only dir */
+			sb->st_mode = S_IFDIR | 00555;
+			sb->st_nlink = 2;
+			ret = 0;
+			goto out;
+		}
+		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
+			ret = -EACCES;
+			goto out;
+		}
+
+		// get uid, gid, from '/tasks' file and make up a mode
+		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
+		sb->st_mode = S_IFDIR | 00755;
+		k = cgfs_get_key(controller, cgroup, NULL);
+		if (!k) {
+			sb->st_uid = sb->st_gid = 0;
+		} else {
+			sb->st_uid = k->uid;
+			sb->st_gid = k->gid;
+		}
+		free_key(k);
+		sb->st_nlink = 2;
+		ret = 0;
+		goto out;
+	}
+
+	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
+		sb->st_mode = S_IFREG | k->mode;
+		sb->st_nlink = 1;
+		sb->st_uid = k->uid;
+		sb->st_gid = k->gid;
+		sb->st_size = 0;
+		free_key(k);
+		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
+			ret = -ENOENT;
+			goto out;
+		}
+		ret = 0;
+	}
+
+out:
+	free(cgdir);
+	return ret;
+}
+
+/*
+ * Chown all the files in the cgroup directory @dirname (relative to @fd) to
+ * @uid:@gid.  We do this when we create a cgroup on behalf of a user.
+ */
+static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
+{
+	struct dirent *direntp;
+	char path[MAXPATHLEN];
+	DIR *d;
+	int fd1, ret;
+
+	if (strlen(dirname) >= MAXPATHLEN) {
+		lxcfs_error("Pathname too long: %s\n", dirname);
+		return;
+	}
+
+	fd1 = openat(fd, dirname, O_DIRECTORY);
+	if (fd1 < 0)
+		return;
+
+	d = fdopendir(fd1);
+	if (!d) {
+		lxcfs_error("Failed to open %s\n", dirname);
+		close(fd1); /* fdopendir() failed, so fd1 is still ours to release */
+		return;
+	}
+
+	while ((direntp = readdir(d))) {
+		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
+			continue;
+		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
+		if (ret < 0 || ret >= MAXPATHLEN) {
+			lxcfs_error("Pathname too long under %s\n", dirname);
+			continue;
+		}
+		if (fchownat(fd, path, uid, gid, 0) < 0)
+			lxcfs_error("Failed to chown file %s to %u:%u\n", path, uid, gid);
+	}
+	/* Closing the stream also closes fd1. */
+	closedir(d);
+}
+
+static int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
+{
+	int cfd;
+	size_t len;
+	char *dirnam;
+
+	cfd = get_cgroup_fd(controller);
+	if (cfd < 0)
+		return -EINVAL;
+
+	/* Make sure we pass a relative path to *at() family of functions.
+	 * . + /cg + \0
+	 */
+	len = strlen(cg) + 2;
+	dirnam = alloca(len);
+	snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
+
+	if (mkdirat(cfd, dirnam, 0755) < 0)
+		return -errno;
+
+	if (uid == 0 && gid == 0)
+		return 0;
+
+	if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
+		return -errno;
+
+	chown_all_cgroup_files(dirnam, uid, gid, cfd);
+
+	return 0;
+}
+
+int cg_mkdir(const char *path, mode_t mode)
+{
+	struct fuse_context *fc = fuse_get_context();
+	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
+	const char *cgroup;
+	int ret;
+
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
+	controller = pick_controller_from_path(fc, path);
+	if (!controller)
+		return errno == ENOENT ? -EPERM : -errno;
+
+	cgroup = find_cgroup_in_path(path);
+	if (!cgroup)
+		return -errno;
+
+	get_cgdir_and_path(cgroup, &cgdir, &last);
+	if (!last)
+		path1 = "/";
+	else
+		path1 = cgdir;
+
+	pid_t initpid = lookup_initpid_in_store(fc->pid);
+	if (initpid <= 1 || is_shared_pidns(initpid))
+		initpid = fc->pid;
+	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
+		if (!next)
+			ret = -EINVAL;
+		else if (last && strcmp(next, last) == 0)
+			ret = -EEXIST;
+		else
+			ret = -EPERM;
+		goto out;
+	}
+
+	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
+		ret = -EACCES;
+		goto out;
+	}
+	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
+		ret = -EACCES;
+		goto out;
+	}
+
+	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
+
+out:
+	free(cgdir);
+	free(next);
+	return ret;
+}
+
+/* Remove directory @dirname (a path relative to @cfd) after trying to remove its
+ * child directories; the directory stream is opened from a dup() of @fd. */
+static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
+{
+	struct dirent *direntp;
+	DIR *dir;
+	bool ret = false;
+	char pathname[MAXPATHLEN];
+	int dupfd;
+
+	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
+	if (dupfd < 0)
+		return false;
+
+	dir = fdopendir(dupfd);
+	if (!dir) {
+		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
+		close(dupfd);
+		return false;
+	}
+
+	while ((direntp = readdir(dir))) {
+		struct stat mystat;
+		int rc;
+
+		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
+			continue;
+
+		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
+		if (rc < 0 || rc >= MAXPATHLEN) {
+			lxcfs_error("%s\n", "Pathname too long.");
+			continue;
+		}
+
+		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
+		if (rc) {
+			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
+			continue;
+		}
+		/* NOTE(review): the recursion reuses the parent's @fd; confirm intended. */
+		if (S_ISDIR(mystat.st_mode))
+			if (!recursive_rmdir(pathname, fd, cfd))
+				lxcfs_debug("Error removing %s.\n", pathname);
+	}
+
+	ret = true;
+	if (closedir(dir) < 0) { /* closedir() also closes dupfd */
+		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
+		ret = false;
+	}
+
+	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
+		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
+		ret = false;
+	}
+
+	return ret;
+}
+
+static bool cgfs_remove(const char *controller, const char *cg)
+{
+	int fd, cfd;
+	size_t len;
+	char *dirnam;
+	bool bret;
+
+	cfd = get_cgroup_fd(controller);
+	if (cfd < 0)
+		return false;
+
+	/* Make sure we pass a relative path to *at() family of functions.
+	 * . +  /cg + \0
+	 */
+	len = strlen(cg) + 2;
+	dirnam = alloca(len);
+	snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
+
+	fd = openat(cfd, dirnam, O_DIRECTORY);
+	if (fd < 0)
+		return false;
+
+	bret = recursive_rmdir(dirnam, fd, cfd);
+	close(fd);
+	return bret;
+}
+
+int cg_rmdir(const char *path)
+{
+	struct fuse_context *fc = fuse_get_context();
+	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
+	const char *cgroup;
+	int ret;
+
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
+	controller = pick_controller_from_path(fc, path);
+	if (!controller) /* Someone's trying to delete "/cgroup". */
+		return -EPERM;
+
+	cgroup = find_cgroup_in_path(path);
+	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
+		return -EPERM;
+
+	get_cgdir_and_path(cgroup, &cgdir, &last);
+	if (!last) {
+		/* Someone's trying to delete a cgroup on the same level as the
+		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
+		 * rmdir "/cgroup/blkio/init.slice".
+		 */
+		ret = -EPERM;
+		goto out;
+	}
+
+	pid_t initpid = lookup_initpid_in_store(fc->pid);
+	if (initpid <= 1 || is_shared_pidns(initpid))
+		initpid = fc->pid;
+	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
+		if (!last || (next && (strcmp(next, last) == 0)))
+			ret = -EBUSY;
+		else
+			ret = -ENOENT;
+		goto out;
+	}
+
+	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
+		ret = -EACCES;
+		goto out;
+	}
+	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
+		ret = -EACCES;
+		goto out;
+	}
+
+	if (!cgfs_remove(controller, cgroup)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = 0;
+
+out:
+	free(cgdir);
+	free(next);
+	return ret;
+}
+
+static bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
+{
+	int cfd;
+	size_t len;
+	char *pathname;
+
+	cfd = get_cgroup_fd(controller);
+	if (cfd < 0)
+		return false;
+
+	/* Make sure we pass a relative path to *at() family of functions.
+	 * . + /file + \0
+	 */
+	len = strlen(file) + 2;
+	pathname = alloca(len);
+	snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
+	if (fchmodat(cfd, pathname, mode, 0) < 0)
+		return false;
+	return true;
+}
+
+int cg_chmod(const char *path, mode_t mode)
+{
+	struct fuse_context *fc = fuse_get_context();
+	char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
+	struct cgfs_files *k = NULL;
+	const char *cgroup;
+	int ret;
+
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
+	if (strcmp(path, "/cgroup") == 0)
+		return -EPERM;
+
+	controller = pick_controller_from_path(fc, path);
+	if (!controller)
+		return errno == ENOENT ? -EPERM : -errno;
+
+	cgroup = find_cgroup_in_path(path);
+	if (!cgroup)
+		/* this is just /cgroup/controller */
+		return -EPERM;
+
+	get_cgdir_and_path(cgroup, &cgdir, &last);
+
+	if (!last) {
+		path1 = "/";
+		path2 = cgdir;
+	} else {
+		path1 = cgdir;
+		path2 = last;
+	}
+
+	if (is_child_cgroup(controller, path1, path2)) {
+		// get uid, gid, from '/tasks' file and make up a mode
+		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
+		k = cgfs_get_key(controller, cgroup, "tasks");
+
+	} else
+		k = cgfs_get_key(controller, path1, path2);
+
+	if (!k) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * This being a fuse request, the uid and gid must be valid
+	 * in the caller's namespace.  So we can just check to make
+	 * sure that the caller is root in his uid, and privileged
+	 * over the file's current owner.
+	 */
+	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (!cgfs_chmod_file(controller, cgroup, mode)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = 0;
+out:
+	free_key(k);
+	free(cgdir);
+	return ret;
+}
+
+/* 1 if @path (relative to dirfd @fd) is a directory, else 0; fstatat() flags are 0. */
+static int is_dir(const char *path, int fd)
+{
+	struct stat statbuf;
+	if (fstatat(fd, path, &statbuf, 0) == 0 && S_ISDIR(statbuf.st_mode))
+		return 1;
+	return 0;
+}
+
+static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
+{
+	size_t len;
+	char *fname;
+
+	len = strlen(dirname) + strlen("/cgroup.procs") + 1;
+	fname = alloca(len);
+	snprintf(fname, len, "%s/tasks", dirname);
+	if (fchownat(fd, fname, uid, gid, 0) != 0)
+		return -errno;
+	snprintf(fname, len, "%s/cgroup.procs", dirname);
+	if (fchownat(fd, fname, uid, gid, 0) != 0)
+		return -errno;
+	return 0;
+}
+
+/* chown @file below @controller to @uid:@gid; a directory also gets its task
+ * files chowned. Returns 0 on success or a negative errno value. */
+static int cgfs_chown_file(const char *controller, const char *file, uid_t uid,
+			   gid_t gid)
+{
+	int cfd;
+	size_t len;
+	char *pathname;
+
+	cfd = get_cgroup_fd(controller);
+	if (cfd < 0)
+		return -EINVAL; /* "false" is 0 and would have reported success */
+
+	/* *at() functions need a relative path: . + /file + \0 */
+	len = strlen(file) + 2;
+	pathname = alloca(len);
+	snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
+	if (fchownat(cfd, pathname, uid, gid, 0) < 0)
+		return -errno;
+
+	if (is_dir(pathname, cfd))
+		return chown_tasks_files(pathname, uid, gid, cfd);
+
+	return 0;
+}
+
+int cg_chown(const char *path, uid_t uid, gid_t gid)
+{
+	struct fuse_context *fc = fuse_get_context();
+	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
+	struct cgfs_files *k = NULL;
+	const char *cgroup;
+	int ret;
+
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
+	if (strcmp(path, "/cgroup") == 0)
+		return -EPERM;
+
+	controller = pick_controller_from_path(fc, path);
+	if (!controller)
+		return errno == ENOENT ? -EPERM : -errno;
+
+	cgroup = find_cgroup_in_path(path);
+	if (!cgroup)
+		/* this is just /cgroup/controller */
+		return -EPERM;
+
+	get_cgdir_and_path(cgroup, &cgdir, &last);
+
+	if (!last) {
+		path1 = "/";
+		path2 = cgdir;
+	} else {
+		path1 = cgdir;
+		path2 = last;
+	}
+
+	if (is_child_cgroup(controller, path1, path2)) {
+		// get uid, gid, from '/tasks' file and make up a mode
+		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
+		k = cgfs_get_key(controller, cgroup, "tasks");
+
+	} else
+		k = cgfs_get_key(controller, path1, path2);
+
+	if (!k) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * This being a fuse request, the uid and gid must be valid
+	 * in the caller's namespace.  So we can just check to make
+	 * sure that the caller is root in his uid, and privileged
+	 * over the file's current owner.
+	 */
+	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
+		ret = -EACCES;
+		goto out;
+	}
+
+	ret = cgfs_chown_file(controller, cgroup, uid, gid);
+
+out:
+	free_key(k);
+	free(cgdir);
+
+	return ret;
+}
+
+int cg_open(const char *path, struct fuse_file_info *fi)
+{
+	const char *cgroup;
+	char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
+	struct cgfs_files *k = NULL;
+	struct file_info *file_info;
+	struct fuse_context *fc = fuse_get_context();
+	int ret;
+
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
+	controller = pick_controller_from_path(fc, path);
+	if (!controller)
+		return -errno;
+	cgroup = find_cgroup_in_path(path);
+	if (!cgroup)
+		return -errno;
+
+	get_cgdir_and_path(cgroup, &cgdir, &last);
+	if (!last) {
+		path1 = "/";
+		path2 = cgdir;
+	} else {
+		path1 = cgdir;
+		path2 = last;
+	}
+
+	k = cgfs_get_key(controller, path1, path2);
+	if (!k) {
+		ret = -EINVAL;
+		goto out;
+	}
+	free_key(k);
+
+	pid_t initpid = lookup_initpid_in_store(fc->pid);
+	if (initpid <= 1 || is_shared_pidns(initpid))
+		initpid = fc->pid;
+	if (!caller_may_see_dir(initpid, controller, path1)) {
+		ret = -ENOENT;
+		goto out;
+	}
+	if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
+		ret = -EACCES;
+		goto out;
+	}
+
+	/* we'll free this at cg_release */
+	file_info = malloc(sizeof(*file_info));
+	if (!file_info) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	file_info->controller = must_copy_string(controller);
+	file_info->cgroup = must_copy_string(path1);
+	file_info->file = must_copy_string(path2);
+	file_info->type = LXC_TYPE_CGFILE;
+	file_info->buf = NULL;
+	file_info->buflen = 0;
+
+	fi->fh = (unsigned long)file_info;
+	ret = 0;
+
+out:
+	free(cgdir);
+	return ret;
+}
+
+#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
+
+/* Wait up to @timeout seconds for @sock to become readable or to hang up.
+ * Returns true if epoll reported an event, false on timeout or error. */
+static bool wait_for_sock(int sock, int timeout)
+{
+	struct epoll_event ev = { .events = POLLIN_SET, .data.fd = sock };
+	int epfd, ret, now, starttime, deltatime, saved_errno;
+
+	if ((starttime = time(NULL)) < 0)
+		return false;
+
+	if ((epfd = epoll_create(1)) < 0) {
+		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
+		return false;
+	}
+
+	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
+		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
+		close(epfd);
+		return false;
+	}
+
+again:
+	if ((now = time(NULL)) < 0) {
+		close(epfd);
+		return false;
+	}
+
+	deltatime = (starttime + timeout) - now;
+	if (deltatime < 0) { // timeout
+		errno = 0;
+		close(epfd);
+		return false;
+	}
+	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
+	if (ret < 0 && errno == EINTR)
+		goto again;
+	saved_errno = errno;
+	close(epfd);
+
+	if (ret <= 0) {
+		errno = saved_errno;
+		return false;
+	}
+	return true;
+}
+
+/* recv() up to @len bytes from @sockfd without blocking, after waiting up to 2s; -1 if the wait fails. */
+static int msgrecv(int sockfd, void *buf, size_t len)
+{
+	bool ready = wait_for_sock(sockfd, 2);
+	return ready ? recv(sockfd, buf, len, MSG_DONTWAIT) : -1;
+}
+
+#define SEND_CREDS_OK 0
+#define SEND_CREDS_NOTSK 1
+#define SEND_CREDS_FAIL 2
+
+/* Send @cred as an SCM_CREDENTIALS message carrying the byte @v over @sock; with
+ * @pingfirst, wait for one byte from the peer first. Returns a SEND_CREDS_* code. */
+static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
+{
+	struct msghdr msg = { 0 };
+	struct iovec iov;
+	struct cmsghdr *cmsg;
+	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
+	char buf[1] = { 'p' };
+
+	if (pingfirst) {
+		if (msgrecv(sock, buf, 1) != 1) {
+			lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
+			return SEND_CREDS_FAIL;
+		}
+	}
+
+	msg.msg_control = cmsgbuf;
+	msg.msg_controllen = sizeof(cmsgbuf);
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_CREDENTIALS;
+	memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
+
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+
+	buf[0] = v;
+	iov.iov_base = buf;
+	iov.iov_len = sizeof(buf);
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	if (sendmsg(sock, &msg, 0) < 0) {
+		int saved_errno = errno; /* logging below may overwrite errno */
+		lxcfs_error("Failed at sendmsg: %s.\n", strerror(saved_errno));
+		return saved_errno == ESRCH ? SEND_CREDS_NOTSK : SEND_CREDS_FAIL;
+	}
+
+	return SEND_CREDS_OK;
+}
+
+/* Reap child @pid, retrying on EINTR; 0 only for a clean exit with status 0. */
+static int wait_for_pid(pid_t pid)
+{
+	int status;
+	pid_t reaped;
+
+	if (pid <= 0)
+		return -1;
+
+	do {
+		reaped = waitpid(pid, &status, 0);
+		if (reaped == -1 && errno != EINTR)
+			return -1;
+	} while (reaped != pid);
+
+	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
+		return 0;
+
+	return -1;
+}
+
+/*
+ * recv_creds - write a '1' byte to @sock, then receive one byte into @v and,
+ * if present, ucred-sized SCM_CREDENTIALS into @cred (else pid/uid/gid stay
+ * -1). Returns false, with @v == '1', if setsockopt, write, the
+ * wait_for_sock(sock, 2) wait or recvmsg fails.
+ */
+static bool recv_creds(int sock, struct ucred *cred, char *v)
+{
+	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
+	char buf[1] = { '1' };
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_name = NULL,
+		.msg_namelen = 0,
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = cmsgbuf,
+		.msg_controllen = sizeof(cmsgbuf),
+	};
+	struct cmsghdr *cmsg;
+	int optval = 1;
+
+	/* Defaults reported to the caller on any failure path. */
+	*v = '1';
+	cred->pid = -1;
+	cred->uid = -1;
+	cred->gid = -1;
+
+	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
+		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
+		return false;
+	}
+	if (write(sock, buf, 1) != 1) {
+		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
+		return false;
+	}
+	if (!wait_for_sock(sock, 2)) {
+		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
+		return false;
+	}
+	if (recvmsg(sock, &msg, MSG_DONTWAIT) < 0) {
+		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
+		return false;
+	}
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
+	    cmsg->cmsg_type == SCM_CREDENTIALS &&
+	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)))
+		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
+
+	*v = buf[0];
+	return true;
+}
+
+/*
+ * pid_to_ns - reads pids from a ucred over a socket, then writes the
+ * int value back over the socket.  This shifts the pid from the
+ * sender's pidns into tpid's pidns.
+ */
+static int pid_to_ns(int sock, pid_t tpid)
+{
+	struct ucred ucred;
+	char cmd = '0';
+
+	for (;;) {
+		/* Stop on a receive failure or on the '1' terminator byte. */
+		if (!recv_creds(sock, &ucred, &cmd) || cmd == '1')
+			break;
+
+		if (write(sock, &ucred.pid, sizeof(pid_t)) != sizeof(pid_t))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
+ * with clone(). This simply writes '1' as ACK back to the parent
+ * before calling the actual wrapped function.
+ */
+static int pid_ns_clone_wrapper(void *arg) {
+	struct pid_ns_clone_args *clone_args = arg;
+	const char ack = '1';
+
+	close(clone_args->cpipe[0]);
+	if (write(clone_args->cpipe[1], &ack, sizeof(ack)) < 0)
+		lxcfs_error("(child): error on write: %s.\n", strerror(errno));
+	close(clone_args->cpipe[1]);
+	return clone_args->wrapped(clone_args->sock, clone_args->tpid);
+}
+
+/*
+ * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
+ * in your old pidns.  Only children which you clone will be in the target
+ * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
+ * actually convert pids.
+ *
+ * Note: glibc's fork() does not respect pidns, which can lead to failed
+ * assertions inside glibc (and thus failed forks) if the child's pid in
+ * the pidns and the parent pid outside are identical. Using clone prevents
+ * this issue.
+ */
+static void pid_to_ns_wrapper(int sock, pid_t tpid)
+{
+	int newnsfd = -1, ret, cpipe[2];
+	char fnam[100];
+	pid_t cpid;
+	char v;
+
+	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
+	if (ret < 0 || (size_t)ret >= sizeof(fnam))
+		_exit(1);
+	newnsfd = open(fnam, O_RDONLY);
+	if (newnsfd < 0)
+		_exit(1);
+	if (setns(newnsfd, 0) < 0)
+		_exit(1);
+	close(newnsfd);
+	/* The cloned child acks over this pipe once it is running. */
+	if (pipe(cpipe) < 0)
+		_exit(1);
+
+	struct pid_ns_clone_args args = {
+		.cpipe = cpipe,
+		.sock = sock,
+		.tpid = tpid,
+		.wrapped = &pid_to_ns
+	};
+	size_t stack_size = sysconf(_SC_PAGESIZE);
+	void *stack = alloca(stack_size);
+
+	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
+	if (cpid < 0)
+		_exit(1);
+
+	/* Give the child 1 second to be done forking and write its ack. */
+	if (!wait_for_sock(cpipe[0], 1))
+		_exit(1);
+	ret = read(cpipe[0], &v, 1);
+	if (ret != sizeof(char) || v != '1')
+		_exit(1);
+	/* wait_for_pid() returns 0 on success, so only a failure exits with 1. */
+	if (wait_for_pid(cpid) < 0)
+		_exit(1);
+	_exit(0);
+}
+
+/*
+ * append pid to *src.
+ * src: a pointer to a char* in which to append the pid.
+ * sz: the number of characters printed so far, minus trailing \0.
+ * asz: the allocated size so far
+ * pid: the pid to append
+ */
+static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
+{
+	must_strcat(src, sz, asz, "%d\n", (int)pid); /* decimal pid followed by a newline */
+}
+
+/*
+ * To read cgroup files with a particular pid, we will setns into the child
+ * pidns, open a pipe, fork a child - which will be the first to really be in
+ * the child ns - which does the cgfs_get_value and writes the data to the pipe.
+ */
+static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg,
+			 const char *file, char **d)
+{
+	int sock[2] = {-1, -1};
+	char *tmpdata = NULL;
+	int ret;
+	pid_t qpid, cpid = -1;
+	bool answer = false;
+	char v = '0';
+	struct ucred cred;
+	size_t sz = 0, asz = 0;
+
+	if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
+		return false;
+
+	/*
+	 * Read the pids from the returned data one by one, pass each to
+	 * a child in the target namespace, read back the translated
+	 * pid, and append it to the data handed back through @d.
+	 */
+
+	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
+		perror("socketpair");
+		free(tmpdata);
+		return false;
+	}
+
+	cpid = fork();
+	if (cpid == -1)
+		goto out;
+
+	if (!cpid) // child - never returns, pid_to_ns_wrapper() always ends in _exit()
+		pid_to_ns_wrapper(sock[1], tpid);
+	/* Parent: @tmpdata is parsed one line (one pid) at a time. */
+	char *ptr = tmpdata;
+	cred.uid = 0;
+	cred.gid = 0;
+	while (sscanf(ptr, "%d\n", &qpid) == 1) {
+		cred.pid = qpid;
+		ret = send_creds(sock[0], &cred, v, true);
+
+		if (ret == SEND_CREDS_NOTSK)
+			goto next;
+		if (ret == SEND_CREDS_FAIL)
+			goto out;
+
+		// read the converted pid sent back by the child
+		if (!wait_for_sock(sock[0], 2)) {
+			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
+			goto out;
+		}
+		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
+			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
+			goto out;
+		}
+		must_strcat_pid(d, &sz, &asz, qpid);
+next:
+		ptr = strchr(ptr, '\n');
+		if (!ptr)
+			break;
+		ptr++;
+	}
+	/* v == '1' is the byte on which pid_to_ns() returns in the child. */
+	cred.pid = getpid();
+	v = '1';
+	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
+		// failed to ask child to exit
+		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
+		goto out;
+	}
+
+	answer = true;
+
+out:
+	free(tmpdata);
+	if (cpid != -1)
+		wait_for_pid(cpid);
+	if (sock[0] != -1) {
+		close(sock[0]);
+		close(sock[1]);
+	}
+	return answer;
+}
+
+/*
+ * cg_read - FUSE read handler for a cgroup file opened through lxcfs.
+ *
+ * Only reads at offset 0 return data; any other offset yields 0. The
+ * "tasks" and "cgroup.procs" files are routed through do_read_pids(),
+ * every other file is fetched with cgroup_ops->get(). At most @size bytes
+ * are copied to @buf and a '\n' is appended when the data lacks one and
+ * there is room. Returns the byte count or a negative errno value.
+ */
+int cg_read(const char *path, char *buf, size_t size, off_t offset,
+	    struct fuse_file_info *fi)
+{
+	struct fuse_context *fc = fuse_get_context();
+	struct file_info *f = (struct file_info *)fi->fh;
+	struct cgfs_files *key;
+	char *data = NULL;
+	bool is_pid_file, ok;
+	int len;
+
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
+	if (f->type != LXC_TYPE_CGFILE) {
+		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
+		return -EIO;
+	}
+
+	if (offset)
+		return 0;
+
+	if (!f->controller)
+		return -EINVAL;
+
+	/* The key is looked up only as a check; it is released unused. */
+	key = cgfs_get_key(f->controller, f->cgroup, f->file);
+	if (!key)
+		return -EINVAL;
+	free_key(key);
+
+	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
+		return -EACCES;
+
+	is_pid_file = !strcmp(f->file, "tasks") || !strcmp(f->file, "/tasks") ||
+		      !strcmp(f->file, "/cgroup.procs") ||
+		      !strcmp(f->file, "cgroup.procs");
+	if (is_pid_file) /* special case - we have to translate the pids */
+		ok = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
+	else
+		ok = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data);
+
+	if (!ok) {
+		free(data);
+		return -EINVAL;
+	}
+	if (!data)
+		return 0;
+
+	len = strlen(data);
+	if (len > size)
+		len = size;
+	memcpy(buf, data, len);
+	if (len > 0 && len < size && data[len - 1] != '\n')
+		buf[len++] = '\n';
+	free(data);
+	return len;
+}
+
+/*
+ * cg_opendir - FUSE opendir handler: validates that the caller may see and
+ * read the directory named by @path and stores a heap-allocated file_info
+ * of type LXC_TYPE_CGDIR in fi->fh. Returns 0 or a negative errno value.
+ */
+int cg_opendir(const char *path, struct fuse_file_info *fi)
+{
+	struct fuse_context *fc = fuse_get_context();
+	struct file_info *dir_info;
+	const char *cgroup = NULL;
+	char *controller = NULL;
+	pid_t initpid;
+
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
+	if (strcmp(path, "/cgroup") != 0) {
+		/* return list of keys for the controller, and list of child cgroups */
+		controller = pick_controller_from_path(fc, path);
+		if (!controller)
+			return -errno;
+
+		/* NULL means this is just /cgroup/controller: use its root */
+		cgroup = find_cgroup_in_path(path);
+		if (!cgroup)
+			cgroup = "/";
+	}
+
+	initpid = lookup_initpid_in_store(fc->pid);
+	if (initpid <= 1 || is_shared_pidns(initpid))
+		initpid = fc->pid;
+	if (cgroup && !caller_may_see_dir(initpid, controller, cgroup))
+		return -ENOENT;
+	if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
+		return -EACCES;
+
+	/* we'll free this at cg_releasedir */
+	dir_info = malloc(sizeof(*dir_info));
+	if (!dir_info)
+		return -ENOMEM;
+	dir_info->type = LXC_TYPE_CGDIR;
+	dir_info->controller = must_copy_string(controller);
+	dir_info->cgroup = must_copy_string(cgroup);
+	dir_info->file = NULL;
+	dir_info->buf = NULL;
+	dir_info->buflen = 0;
+
+	fi->fh = (unsigned long)dir_info;
+	return 0;
+}
+
+int cg_release(const char *path, struct fuse_file_info *fi)
+{
+	do_release_file_info(fi); /* frees the file_info kept in fi->fh and zeroes fi->fh */
+	return 0;
+}
+
+int cg_releasedir(const char *path, struct fuse_file_info *fi)
+{
+	do_release_file_info(fi); /* frees the file_info kept in fi->fh and zeroes fi->fh */
+	return 0;
+}
+
+/* Open <cgroup>/cgroup.procs of @controller for writing; NULL on failure. */
+static FILE *open_pids_file(const char *controller, const char *cgroup)
+{
+	int fd, cfd, ret;
+	size_t len;
+	char *pathname;
+	FILE *f;
+
+	cfd = get_cgroup_fd(controller);
+	if (cfd < 0)
+		return NULL;
+	/* Relative path for openat(): . + /cgroup + / + "cgroup.procs" + \0 */
+	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
+	pathname = alloca(len);
+	ret = snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);
+	if (ret < 0 || (size_t)ret >= len)
+		return NULL;
+	fd = openat(cfd, pathname, O_WRONLY);
+	f = fd < 0 ? NULL : fdopen(fd, "w");
+	if (!f && fd >= 0)
+		close(fd); /* fdopen() failed: do not leak the descriptor */
+	return f;
+}
+
+/*
+ * pid_from_ns - reads pids from @sock and answers each one with
+ * send_creds() until -1 is received. Returns 0 when done, 1 on error.
+ */
+static int pid_from_ns(int sock, pid_t tpid)
+{
+	struct ucred cred = { .uid = 0, .gid = 0 };
+	pid_t vpid;
+
+	for (;;) {
+		if (!wait_for_sock(sock, 2)) {
+			lxcfs_error("%s\n", "Timeout reading from parent.");
+			return 1;
+		}
+		if (read(sock, &vpid, sizeof(pid_t)) != sizeof(pid_t)) {
+			lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
+			return 1;
+		}
+		if (vpid == -1) /* done */
+			return 0;
+
+		cred.pid = vpid;
+		if (send_creds(sock, &cred, '0', true) == SEND_CREDS_OK)
+			continue;
+
+		/* Could not send @vpid: report '1' with our own pid instead. */
+		cred.pid = getpid();
+		if (send_creds(sock, &cred, '1', false) != SEND_CREDS_OK)
+			return 1;
+	}
+}
+
+static void pid_from_ns_wrapper(int sock, pid_t tpid)
+{
+	int newnsfd = -1, ret, cpipe[2];
+	char fnam[100];
+	pid_t cpid;
+	char v;
+
+	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
+	if (ret < 0 || (size_t)ret >= sizeof(fnam))
+		_exit(1);
+	newnsfd = open(fnam, O_RDONLY);
+	if (newnsfd < 0)
+		_exit(1);
+	if (setns(newnsfd, 0) < 0)
+		_exit(1);
+	close(newnsfd);
+	/* The cloned child acks over this pipe once it is running. */
+	if (pipe(cpipe) < 0)
+		_exit(1);
+
+	struct pid_ns_clone_args args = {
+		.cpipe = cpipe,
+		.sock = sock,
+		.tpid = tpid,
+		.wrapped = &pid_from_ns
+	};
+	size_t stack_size = sysconf(_SC_PAGESIZE);
+	void *stack = alloca(stack_size);
+
+	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
+	if (cpid < 0)
+		_exit(1);
+
+	// give the child 1 second to be done forking and
+	// write its ack
+	if (!wait_for_sock(cpipe[0], 1))
+		_exit(1);
+	ret = read(cpipe[0], &v, 1);
+	if (ret != sizeof(char) || v != '1')
+		_exit(1);
+	/* wait_for_pid() returns 0 on success, so only a failure exits with 1. */
+	if (wait_for_pid(cpid) < 0)
+		_exit(1);
+	_exit(0);
+}
+
+/*
+ * get_pid_creds: get the real uid and gid of @pid from
+ * /proc/$$/status
+ * (XXX should we use euid here?)
+ */
+static void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
+{
+	uid_t parsed_uid;
+	gid_t parsed_gid;
+	char line[400];
+	FILE *status;
+
+	*uid = -1;
+	*gid = -1;
+	sprintf(line, "/proc/%d/status", pid);
+	status = fopen(line, "r");
+	if (!status) {
+		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
+		return;
+	}
+	/* A malformed Uid:/Gid: line stops the scan; sscanf() reads its first number. */
+	while (fgets(line, sizeof(line), status)) {
+		if (!strncmp(line, "Uid:", 4)) {
+			if (sscanf(line + 4, "%u", &parsed_uid) != 1) {
+				lxcfs_error("bad uid line for pid %u\n", pid);
+				break;
+			}
+			*uid = parsed_uid;
+		} else if (!strncmp(line, "Gid:", 4)) {
+			if (sscanf(line + 4, "%u", &parsed_gid) != 1) {
+				lxcfs_error("bad gid line for pid %u\n", pid);
+				break;
+			}
+			*gid = parsed_gid;
+		}
+	}
+	fclose(status);
+}
+
+/*
+ * Given host @uid, return the uid to which it maps in
+ * @pid's user namespace, or -1 if none.
+ */
+static bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
+{
+	char map_path[400];
+	FILE *uid_map;
+
+	sprintf(map_path, "/proc/%d/uid_map", pid);
+	uid_map = fopen(map_path, "r");
+	if (!uid_map)
+		return false;
+
+	/* @uid is translated through the uid_map of @pid; a result of -1
+	 * from convert_id_to_ns() is treated as "not mapped". */
+	*answer = convert_id_to_ns(uid_map, uid);
+	fclose(uid_map);
+
+	return *answer != -1;
+}
+
+/*
+ * May the requestor @r move victim @v to a new cgroup?
+ * This is allowed if
+ *   . they are the same task
+ *   . they are owned by the same uid
+ *   . @r is root on the host, or
+ *   . @v's uid is mapped into @r's where @r is root.
+ */
+static bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
+{
+	uid_t v_uid, mapped;
+	gid_t v_gid;
+
+	/* Same task, or requestor is host root: nothing else to check. */
+	if (r == v || r_uid == 0)
+		return true;
+	get_pid_creds(v, &v_uid, &v_gid);
+	if (r_uid == v_uid)
+		return true;
+
+	/* Requestor must map to uid 0 in its own userns ... */
+	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
+		return false;
+	return hostuid_to_ns(v_uid, r, &mapped); /* ... and the victim be mapped */
+}
+
+static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl,
+			  const char *cg, const char *file, const char *buf)
+{
+	int sock[2] = {-1, -1};
+	pid_t qpid, cpid = -1;
+	FILE *pids_file = NULL;
+	bool answer = false, fail = false;
+
+	pids_file = open_pids_file(contrl, cg);
+	if (!pids_file)
+		return false;
+
+	/*
+	 * Send each pid in @buf to a helper in the writer's pidns; the pid that
+	 * comes back in the SCM credentials is what gets written to cgroup.procs.
+	 */
+	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
+		perror("socketpair");
+		goto out;
+	}
+
+	cpid = fork();
+	if (cpid == -1)
+		goto out;
+
+	if (!cpid) { // child - never returns, pid_from_ns_wrapper() ends in _exit()
+		fclose(pids_file);
+		pid_from_ns_wrapper(sock[1], tpid);
+	}
+
+	const char *ptr = buf;
+	while (sscanf(ptr, "%d", &qpid) == 1) {
+		struct ucred cred;
+		char v;
+
+		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
+			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
+			goto out;
+		}
+		if (recv_creds(sock[0], &cred, &v)) {
+			if (v == '0') {
+				if (!may_move_pid(tpid, tuid, cred.pid)) {
+					fail = true;
+					break;
+				}
+				/* flush so every pid reaches the kernel as its own write */
+				if (fprintf(pids_file, "%d", (int) cred.pid) < 0 ||
+				    fflush(pids_file) != 0)
+					fail = true;
+			}
+		}
+
+		ptr = strchr(ptr, '\n');
+		if (!ptr)
+			break;
+		ptr++;
+	}
+
+	/* Ask the helper to exit: pid_from_ns() stops when it reads -1. */
+	qpid = -1;
+	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
+		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
+	if (!fail)
+		answer = true;
+
+out:
+	if (cpid != -1)
+		wait_for_pid(cpid);
+	if (sock[0] != -1) {
+		close(sock[0]);
+		close(sock[1]);
+	}
+	if (pids_file) {
+		if (fclose(pids_file) != 0)
+			answer = false;
+	}
+	return answer;
+}
+
+/* Write @string to @fd and close it; @fnam is only used in messages. */
+static bool write_string(const char *fnam, const char *string, int fd)
+{
+	FILE *f = fdopen(fd, "w");
+	size_t len, ret;
+
+	if (!f) {
+		close(fd); /* no stream took ownership of @fd: don't leak it */
+		return false;
+	}
+
+	len = strlen(string);
+	ret = fwrite(string, 1, len, f);
+	if (ret != len) {
+		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
+			    strerror(errno), string, fnam);
+		fclose(f);
+		return false;
+	}
+	if (fclose(f) < 0) {
+		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
+		return false;
+	}
+	return true;
+}
+
+/* Write @value to @file inside @cgroup of @controller; false on any error. */
+static bool cgfs_set_value(const char *controller, const char *cgroup,
+			   const char *file, const char *value)
+{
+	int cgroup_fd, target_fd, written;
+	size_t pathlen;
+	char *relpath;
+
+	cgroup_fd = get_cgroup_fd(controller);
+	if (cgroup_fd < 0)
+		return false;
+
+	/* Relative path for openat(): . + /cgroup + / + file + \0 */
+	pathlen = 1 + strlen(cgroup) + 1 + strlen(file) + 1;
+	relpath = alloca(pathlen);
+	written = snprintf(relpath, pathlen, "%s%s/%s", dot_or_empty(cgroup),
+			   cgroup, file);
+	if (written < 0 || (size_t)written >= pathlen)
+		return false;
+
+	target_fd = openat(cgroup_fd, relpath, O_WRONLY);
+	if (target_fd < 0)
+		return false;
+
+	return write_string(relpath, value, target_fd);
+}
+
+/*
+ * cg_write - FUSE write handler for a cgroup file. Writes at a non-zero
+ * offset are ignored (0 is returned). "tasks" and "cgroup.procs" go through
+ * do_write_pids(), all other files through cgfs_set_value(). Returns @size
+ * on success or a negative errno value.
+ */
+int cg_write(const char *path, const char *buf, size_t size, off_t offset,
+	     struct fuse_file_info *fi)
+{
+	struct fuse_context *fc = fuse_get_context();
+	struct file_info *f = (struct file_info *)fi->fh;
+	struct cgfs_files *key;
+	char *value;
+	int ret = size;
+	bool ok;
+
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
+	if (f->type != LXC_TYPE_CGFILE) {
+		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
+		return -EIO;
+	}
+
+	if (offset)
+		return 0;
+
+	/* NUL-terminated private copy of the caller's data */
+	value = alloca(size + 1);
+	memcpy(value, buf, size);
+	value[size] = '\0';
+
+	key = cgfs_get_key(f->controller, f->cgroup, f->file);
+	if (!key) {
+		ret = -EINVAL;
+	} else if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
+		ret = -EACCES;
+	} else {
+		if (!strcmp(f->file, "tasks") || !strcmp(f->file, "/tasks") ||
+		    !strcmp(f->file, "/cgroup.procs") || !strcmp(f->file, "cgroup.procs"))
+			ok = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, value);
+		else
+			ok = cgfs_set_value(f->controller, f->cgroup, f->file, value);
+		if (!ok)
+			ret = -EINVAL;
+	}
+
+	free_key(key);
+	return ret;
+}
+
+/* Build a NULL-terminated *@list holding one @iterator result for every
+ * regular file (or directory, if @directories) directly inside @cgroup. */
+static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup,
+				bool directories, void ***list, size_t typesize,
+				void *(*iterator)(const char *, const char *, const char *))
+{
+	int cfd, fd, ret;
+	char *cg, pathname[MAXPATHLEN];
+	size_t len, sz = 0, asz = 0;
+	struct dirent *dirent;
+	DIR *dir;
+
+	cfd = get_cgroup_fd(controller);
+	*list = NULL;
+	if (cfd < 0)
+		return false;
+
+	/* Make sure we pass a relative path to *at() family of functions. */
+	len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
+	cg = alloca(len);
+	ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
+	if (ret < 0 || (size_t)ret >= len) {
+		lxcfs_error("Pathname too long under %s\n", cgroup);
+		return false;
+	}
+
+	fd = openat(cfd, cg, O_DIRECTORY);
+	if (fd < 0)
+		return false;
+	dir = fdopendir(fd);
+	if (!dir) {
+		close(fd); /* closedir() will never run for this descriptor */
+		return false;
+	}
+
+	while ((dirent = readdir(dir))) {
+		struct stat mystat;
+
+		if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+			continue;
+
+		ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
+		if (ret < 0 || ret >= MAXPATHLEN) {
+			lxcfs_error("Pathname too long under %s\n", cg);
+			continue;
+		}
+
+		ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
+		if (ret) {
+			lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
+			continue;
+		}
+		if ((!directories && !S_ISREG(mystat.st_mode)) ||
+		    (directories && !S_ISDIR(mystat.st_mode)))
+			continue;
+
+		if (sz+2 >= asz) {
+			void **tmp;
+			asz += BATCH_SIZE;
+			do {
+				tmp = realloc(*list, asz * typesize);
+			} while  (!tmp);
+			*list = tmp;
+		}
+		(*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
+		if ((*list)[sz]) /* a failed entry must not cut the list short */
+			(*list)[++sz] = NULL;
+	}
+	if (closedir(dir) < 0) {
+		lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
+		return false;
+	}
+	return true;
+}
+
+static void *make_key_list_entry(const char *controller, const char *cgroup,
+				 const char *dir_entry)
+{
+	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);
+
+	/* A missing key is only logged; the NULL is still handed back. */
+	if (key)
+		return key;
+	lxcfs_error("Failed to retrieve files under %s:%s\n", controller, cgroup);
+	return NULL;
+}
+
+static bool cgfs_list_keys(const char *controller, const char *cgroup,
+			   struct cgfs_files ***keys)
+{ /* directories == false: one make_key_list_entry() result per regular file */
+	return cgfs_iterate_cgroup(controller, cgroup, false, (void ***)keys,
+				   sizeof(*keys), &make_key_list_entry);
+}
+
+static void *make_children_list_entry(const char *controller,
+				      const char *cgroup, const char *dir_entry)
+{
+	return strdup(dir_entry); /* only the entry name is kept; the other arguments are unused */
+}
+
+static bool cgfs_list_children(const char *controller, const char *cgroup,
+			       char ***list)
+{ /* directories == true: one strdup()'d name per subdirectory */
+	return cgfs_iterate_cgroup(controller, cgroup, true, (void ***)list,
+				   sizeof(*list), &make_children_list_entry);
+}
+
+static void free_keys(struct cgfs_files **keys)
+{
+	struct cgfs_files **cur = keys;
+
+	if (!keys)
+		return;
+	for (; *cur; cur++) /* the array is NULL-terminated */
+		free_key(*cur);
+	free_disarm(keys);
+}
+
+/*
+ * cg_readdir - FUSE readdir handler for a directory opened by cg_opendir().
+ *
+ * At the top level (no controller and no cgroup) the controllers of every
+ * non-unified hierarchy are listed. Otherwise, if caller_is_in_ancestor()
+ * returns false only the @nextcg entry it reports (if any) is listed; else
+ * all keys of the cgroup followed by its child cgroups.
+ */
+int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
+	       off_t offset, struct fuse_file_info *fi)
+{
+	struct fuse_context *fc = fuse_get_context();
+	struct file_info *d = (struct file_info *)fi->fh;
+	struct cgfs_files **keys = NULL;
+	char **children = NULL;
+	char *nextcg = NULL;
+	pid_t initpid;
+	int ret = 0;
+
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
+	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
+		return -EIO;
+
+	if (d->type != LXC_TYPE_CGDIR) {
+		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
+		return -EIO;
+	}
+
+	if (!d->cgroup && !d->controller) {
+		/*
+		 * ls /var/lib/lxcfs/cgroup - just show list of controllers.
+		 * This only works with the legacy hierarchy.
+		 */
+		for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
+			if (is_unified_hierarchy(*h) || !(*h)->__controllers)
+				continue;
+			if (filler(buf, (*h)->__controllers, NULL, 0))
+				return -EIO;
+		}
+
+		return 0;
+	}
+
+	if (!cgfs_list_keys(d->controller, d->cgroup, &keys)) {
+		/* not a valid cgroup */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	initpid = lookup_initpid_in_store(fc->pid);
+	if (initpid <= 1 || is_shared_pidns(initpid))
+		initpid = fc->pid;
+	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
+		if (nextcg) {
+			if (filler(buf, nextcg, NULL, 0) != 0)
+				ret = -EIO;
+			free(nextcg);
+		}
+		goto out;
+	}
+
+	for (struct cgfs_files **k = keys; k && *k; k++) {
+		if (filler(buf, (*k)->name, NULL, 0) != 0) {
+			ret = -EIO;
+			goto out;
+		}
+	}
+
+	/* now get the list of child cgroups; failing to list them is not an error */
+	if (!cgfs_list_children(d->controller, d->cgroup, &children))
+		goto out;
+
+	for (char **c = children; c && *c; c++) {
+		if (filler(buf, *c, NULL, 0) != 0) {
+			ret = -EIO;
+			break;
+		}
+	}
+
+out:
+	free_keys(keys);
+	if (children) {
+		for (char **c = children; *c; c++)
+			free(*c);
+		free(children);
+	}
+	return ret;
+}
+
+/*
+ * cg_access - FUSE access handler for the cgroup tree.
+ *
+ * "/cgroup" is always accessible. A path naming only a controller, or an
+ * entry for which cgfs_get_key() finds no key, allows everything except
+ * W_OK. Any other entry needs caller_may_see_dir() and fc_may_access().
+ */
+int cg_access(const char *path, int mode)
+{
+	struct fuse_context *fc = fuse_get_context();
+	char *cgdir = NULL, *last = NULL;
+	char *controller, *dir, *entry;
+	struct cgfs_files *key;
+	const char *cgroup;
+	pid_t initpid;
+	int ret = 0;
+
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
+	if (strcmp(path, "/cgroup") == 0)
+		return 0;
+
+	controller = pick_controller_from_path(fc, path);
+	if (!controller)
+		return -errno;
+
+	cgroup = find_cgroup_in_path(path);
+	if (!cgroup) {
+		/* access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not */
+		if (mode & W_OK)
+			return -EACCES;
+		return 0;
+	}
+
+	/* Without a @last component, @cgdir itself is looked up under "/". */
+	get_cgdir_and_path(cgroup, &cgdir, &last);
+	dir = last ? cgdir : "/";
+	entry = last ? last : cgdir;
+
+	/* No key: everything but W_OK is allowed. */
+	key = cgfs_get_key(controller, dir, entry);
+	if (!key) {
+		if (mode & W_OK)
+			ret = -EACCES;
+		goto out;
+	}
+	free_key(key);
+
+	/* Use the caller's own pid when no usable init pid is known. */
+	initpid = lookup_initpid_in_store(fc->pid);
+	if (initpid <= 1 || is_shared_pidns(initpid))
+		initpid = fc->pid;
+
+	if (!caller_may_see_dir(initpid, controller, dir))
+		ret = -ENOENT;
+	else if (!fc_may_access(fc, controller, dir, entry, mode))
+		ret = -EACCES;
+
+out:
+	free(cgdir);
+	return ret;
+}
diff --git a/cgroup_fuse.h b/cgroup_fuse.h
new file mode 100644
index 0000000..4515530
--- /dev/null
+++ b/cgroup_fuse.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#ifndef __LXCFS_CGROUP_FUSE_H
+#define __LXCFS_CGROUP_FUSE_H
+/* cg_* FUSE callbacks. NOTE(review): relies on the includer for <fuse.h> types. */
+extern int cg_getattr(const char *path, struct stat *sb);
+extern int cg_mkdir(const char *path, mode_t mode);
+extern int cg_rmdir(const char *path);
+extern int cg_chmod(const char *path, mode_t mode);
+extern int cg_chown(const char *path, uid_t uid, gid_t gid);
+extern int cg_open(const char *path, struct fuse_file_info *fi);
+extern int cg_read(const char *path, char *buf, size_t size, off_t offset,
+		   struct fuse_file_info *fi);
+extern int cg_opendir(const char *path, struct fuse_file_info *fi);
+extern int cg_release(const char *path, struct fuse_file_info *fi);
+extern int cg_releasedir(const char *path, struct fuse_file_info *fi);
+extern int cg_write(const char *path, const char *buf, size_t size,
+		    off_t offset, struct fuse_file_info *fi);
+extern int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
+		      off_t offset, struct fuse_file_info *fi);
+extern int cg_access(const char *path, int mode);
+
+#endif /* __LXCFS_CGROUP_FUSE_H */
diff --git a/utils.c b/utils.c
index 1ebcf16..5bfd442 100644
--- a/utils.c
+++ b/utils.c
@@ -127,3 +127,19 @@ int preserve_ns(const int pid, const char *ns)
 
 	return open(path, O_RDONLY | O_CLOEXEC);
 }
+
+void do_release_file_info(struct fuse_file_info *fi)
+{
+	struct file_info *info = (struct file_info *)fi->fh;
+
+	/* The handle is cleared, so a repeated release finds nothing to free. */
+	fi->fh = 0;
+	if (!info)
+		return;
+
+	free_disarm(info->controller);
+	free_disarm(info->cgroup);
+	free_disarm(info->file);
+	free_disarm(info->buf);
+	free_disarm(info);
+}
diff --git a/utils.h b/utils.h
index fbe775e..0a4dd3c 100644
--- a/utils.h
+++ b/utils.h
@@ -3,11 +3,16 @@
 #ifndef __LXCFS_UTILS_H
 #define __LXCFS_UTILS_H
 
+#define FUSE_USE_VERSION 26
+
+#include <fuse.h>
+
 /* Reserve buffer size to account for file size changes. */
 #define BUF_RESERVE_SIZE 512
 
 extern void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...);
 extern bool is_shared_pidns(pid_t pid);
 extern int preserve_ns(const int pid, const char *ns);
+extern void do_release_file_info(struct fuse_file_info *fi);
 
 #endif /* __LXCFS_UTILS_H */