From: Christian Brauner Date: Tue, 25 Feb 2020 16:17:10 +0000 (+0100) Subject: bindings: split cgroup part of lxcfs into separate files X-Git-Tag: lxcfs-4.0.0~33^2~2 X-Git-Url: https://git.proxmox.com/?p=mirror_lxcfs.git;a=commitdiff_plain;h=580fe4df03735cdbb2f7c3d474b71b951aaddca3 bindings: split cgroup part of lxcfs into separate files This was long overdue since the cgroup faking part is basically unused at this point on most kernels. Signed-off-by: Christian Brauner --- diff --git a/Makefile.am b/Makefile.am index d37aa7f..e3c4c24 100644 --- a/Makefile.am +++ b/Makefile.am @@ -13,6 +13,7 @@ AM_LDFLAGS = $(FUSE_LIBS) -pthread AM_CFLAGS += -DRUNTIME_PATH=\"$(RUNTIME_PATH)\" liblxcfs_la_SOURCES = bindings.c bindings.h \ + cgroup_fuse.c cgroup_fuse.h cgroups/cgfsng.c \ cgroups/cgroup.c cgroups/cgroup.h \ cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \ @@ -25,6 +26,7 @@ liblxcfs_la_CFLAGS = $(AM_CFLAGS) liblxcfs_la_LDFLAGS = $(AM_CFLAGS) -module -avoid-version -shared liblxcfstest_la_SOURCES = bindings.c bindings.h \ + cgroup_fuse.c cgroup_fuse.h cgroups/cgfsng.c \ cgroups/cgroup.c cgroups/cgroup.h \ cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \ diff --git a/bindings.c b/bindings.c index ddaa528..83243b8 100644 --- a/bindings.c +++ b/bindings.c @@ -39,6 +39,7 @@ #include "bindings.h" #include "config.h" +#include "cgroup_fuse.h" #include "cgroups/cgroup.h" #include "cgroups/cgroup_utils.h" #include "memory_utils.h" @@ -574,41 +575,6 @@ static struct pidns_init_store *lookup_verify_initpid(struct stat *sb) return NULL; } -static int is_dir(const char *path, int fd) -{ - struct stat statbuf; - int ret = fstatat(fd, path, &statbuf, fd); - if (ret == 0 && S_ISDIR(statbuf.st_mode)) - return 1; - return 0; -} - -static bool write_string(const char *fnam, const char *string, int fd) -{ - FILE *f; - size_t len, ret; - - f = fdopen(fd, "w"); - if (!f) - return false; - - len = strlen(string); - ret = fwrite(string, 1, len, f); - if (ret != len) { - 
lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n", - strerror(errno), string, fnam); - fclose(f); - return false; - } - - if (fclose(f) < 0) { - lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam); - return false; - } - - return true; -} - struct cgfs_files { char *name; uint32_t uid, gid; @@ -627,10 +593,9 @@ static void print_subsystems(void) } } -bool cgfs_set_value(const char *controller, const char *cgroup, const char *file, - const char *value) +bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file) { - int ret, fd, cfd; + int ret, cfd; size_t len; char *fnam; @@ -647,2401 +612,353 @@ bool cgfs_set_value(const char *controller, const char *cgroup, const char *file if (ret < 0 || (size_t)ret >= len) return false; - fd = openat(cfd, fnam, O_WRONLY); - if (fd < 0) - return false; - - return write_string(fnam, value, fd); + return (faccessat(cfd, fnam, F_OK, 0) == 0); } -// Chown all the files in the cgroup directory. We do this when we create -// a cgroup on behalf of a user. -static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd) -{ - struct dirent *direntp; - char path[MAXPATHLEN]; - size_t len; - DIR *d; - int fd1, ret; +#define SEND_CREDS_OK 0 +#define SEND_CREDS_NOTSK 1 +#define SEND_CREDS_FAIL 2 +static bool recv_creds(int sock, struct ucred *cred, char *v); +static int wait_for_pid(pid_t pid); +static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst); +static int send_creds_clone_wrapper(void *arg); - len = strlen(dirname); - if (len >= MAXPATHLEN) { - lxcfs_error("Pathname too long: %s\n", dirname); - return; - } +/* + * clone a task which switches to @task's namespace and writes '1'. 
+ * over a unix sock so we can read the task's reaper's pid in our + * namespace + * + * Note: glibc's fork() does not respect pidns, which can lead to failed + * assertions inside glibc (and thus failed forks) if the child's pid in + * the pidns and the parent pid outside are identical. Using clone prevents + * this issue. + */ +static void write_task_init_pid_exit(int sock, pid_t target) +{ + char fnam[100]; + pid_t pid; + int fd, ret; + size_t stack_size = sysconf(_SC_PAGESIZE); + void *stack = alloca(stack_size); - fd1 = openat(fd, dirname, O_DIRECTORY); - if (fd1 < 0) - return; + ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target); + if (ret < 0 || ret >= sizeof(fnam)) + _exit(1); - d = fdopendir(fd1); - if (!d) { - lxcfs_error("Failed to open %s\n", dirname); - return; + fd = open(fnam, O_RDONLY); + if (fd < 0) { + perror("write_task_init_pid_exit open of ns/pid"); + _exit(1); } - - while ((direntp = readdir(d))) { - if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, "..")) - continue; - ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); - if (ret < 0 || ret >= MAXPATHLEN) { - lxcfs_error("Pathname too long under %s\n", dirname); - continue; - } - if (fchownat(fd, path, uid, gid, 0) < 0) - lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid); + if (setns(fd, 0)) { + perror("write_task_init_pid_exit setns 1"); + close(fd); + _exit(1); + } + pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock); + if (pid < 0) + _exit(1); + if (pid != 0) { + if (!wait_for_pid(pid)) + _exit(1); + _exit(0); } - closedir(d); } -int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid) -{ - int cfd; - size_t len; - char *dirnam; - - cfd = get_cgroup_fd(controller); - if (cfd < 0) - return -EINVAL; - - /* Make sure we pass a relative path to *at() family of functions. - * . 
+ /cg + \0 - */ - len = strlen(cg) + 2; - dirnam = alloca(len); - snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg); - - if (mkdirat(cfd, dirnam, 0755) < 0) - return -errno; - - if (uid == 0 && gid == 0) - return 0; - - if (fchownat(cfd, dirnam, uid, gid, 0) < 0) - return -errno; - - chown_all_cgroup_files(dirnam, uid, gid, cfd); +static int send_creds_clone_wrapper(void *arg) { + struct ucred cred; + char v; + int sock = *(int *)arg; + /* we are the child */ + cred.uid = 0; + cred.gid = 0; + cred.pid = 1; + v = '1'; + if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) + return 1; return 0; } -static bool recursive_rmdir(const char *dirname, int fd, const int cfd) +static pid_t get_init_pid_for_task(pid_t task) { - struct dirent *direntp; - DIR *dir; - bool ret = false; - char pathname[MAXPATHLEN]; - int dupfd; - - dupfd = dup(fd); // fdopendir() does bad things once it uses an fd. - if (dupfd < 0) - return false; - - dir = fdopendir(dupfd); - if (!dir) { - lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno)); - close(dupfd); - return false; - } - - while ((direntp = readdir(dir))) { - struct stat mystat; - int rc; - - if (!strcmp(direntp->d_name, ".") || - !strcmp(direntp->d_name, "..")) - continue; - - rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); - if (rc < 0 || rc >= MAXPATHLEN) { - lxcfs_error("%s\n", "Pathname too long."); - continue; - } - - rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW); - if (rc) { - lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno)); - continue; - } - if (S_ISDIR(mystat.st_mode)) - if (!recursive_rmdir(pathname, fd, cfd)) - lxcfs_debug("Error removing %s.\n", pathname); - } + int sock[2]; + pid_t pid; + pid_t ret = -1; + char v = '0'; + struct ucred cred; - ret = true; - if (closedir(dir) < 0) { - lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno)); - ret = false; + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) { + perror("socketpair"); + 
return -1; } - if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) { - lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno)); - ret = false; + pid = fork(); + if (pid < 0) + goto out; + if (!pid) { + close(sock[1]); + write_task_init_pid_exit(sock[0], task); + _exit(0); } - close(dupfd); + if (!recv_creds(sock[1], &cred, &v)) + goto out; + ret = cred.pid; +out: + close(sock[0]); + close(sock[1]); + if (pid > 0) + wait_for_pid(pid); return ret; } -bool cgfs_remove(const char *controller, const char *cg) +pid_t lookup_initpid_in_store(pid_t qpid) { - int fd, cfd; - size_t len; - char *dirnam; - bool bret; - - cfd = get_cgroup_fd(controller); - if (cfd < 0) - return false; - - /* Make sure we pass a relative path to *at() family of functions. - * . + /cg + \0 - */ - len = strlen(cg) + 2; - dirnam = alloca(len); - snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg); + pid_t answer = 0; + struct stat sb; + struct pidns_init_store *e; + char fnam[100]; - fd = openat(cfd, dirnam, O_DIRECTORY); - if (fd < 0) - return false; + snprintf(fnam, 100, "/proc/%d/ns/pid", qpid); + store_lock(); + if (stat(fnam, &sb) < 0) + goto out; + e = lookup_verify_initpid(&sb); + if (e) { + answer = e->initpid; + goto out; + } + answer = get_init_pid_for_task(qpid); + if (answer > 0) + save_initpid(&sb, answer); - bret = recursive_rmdir(dirnam, fd, cfd); - close(fd); - return bret; +out: + /* we prune at end in case we are returning + * the value we were about to return */ + prune_initpid_store(); + store_unlock(); + return answer; } -bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode) +static int wait_for_pid(pid_t pid) { - int cfd; - size_t len; - char *pathname; - - cfd = get_cgroup_fd(controller); - if (cfd < 0) - return false; + int status, ret; - /* Make sure we pass a relative path to *at() family of functions. - * . 
+ /file + \0 - */ - len = strlen(file) + 2; - pathname = alloca(len); - snprintf(pathname, len, "%s%s", dot_or_empty(file), file); - if (fchmodat(cfd, pathname, mode, 0) < 0) - return false; - return true; -} + if (pid <= 0) + return -1; -static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd) -{ - size_t len; - char *fname; - - len = strlen(dirname) + strlen("/cgroup.procs") + 1; - fname = alloca(len); - snprintf(fname, len, "%s/tasks", dirname); - if (fchownat(fd, fname, uid, gid, 0) != 0) - return -errno; - snprintf(fname, len, "%s/cgroup.procs", dirname); - if (fchownat(fd, fname, uid, gid, 0) != 0) - return -errno; +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + return -1; + } + if (ret != pid) + goto again; + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) + return -1; return 0; } -int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid) +char *get_pid_cgroup(pid_t pid, const char *contrl) { int cfd; - size_t len; - char *pathname; - cfd = get_cgroup_fd(controller); + cfd = get_cgroup_fd(contrl); if (cfd < 0) return false; - /* Make sure we pass a relative path to *at() family of functions. - * . 
+ /file + \0 - */ - len = strlen(file) + 2; - pathname = alloca(len); - snprintf(pathname, len, "%s%s", dot_or_empty(file), file); - if (fchownat(cfd, pathname, uid, gid, 0) < 0) - return -errno; - - if (is_dir(pathname, cfd)) - // like cgmanager did, we want to chown the tasks file as well - return chown_tasks_files(pathname, uid, gid, cfd); + if (pure_unified_layout(cgroup_ops)) + return cg_unified_get_current_cgroup(pid); - return 0; + return cg_legacy_get_current_cgroup(pid, contrl); } -FILE *open_pids_file(const char *controller, const char *cgroup) +#define INITSCOPE "/init.scope" +void prune_init_slice(char *cg) { - int fd, cfd; - size_t len; - char *pathname; - - cfd = get_cgroup_fd(controller); - if (cfd < 0) - return false; - - /* Make sure we pass a relative path to *at() family of functions. - * . + /cgroup + / "cgroup.procs" + \0 - */ - len = strlen(cgroup) + strlen("cgroup.procs") + 3; - pathname = alloca(len); - snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup); + char *point; + size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE); - fd = openat(cfd, pathname, O_WRONLY); - if (fd < 0) - return NULL; + if (cg_len < initscope_len) + return; - return fdopen(fd, "w"); + point = cg + cg_len - initscope_len; + if (strcmp(point, INITSCOPE) == 0) { + if (point == cg) + *(point+1) = '\0'; + else + *point = '\0'; + } } -static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories, - void ***list, size_t typesize, - void* (*iterator)(const char*, const char*, const char*)) +#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP ) + +static bool wait_for_sock(int sock, int timeout) { - int cfd, fd, ret; - size_t len; - char *cg; - char pathname[MAXPATHLEN]; - size_t sz = 0, asz = 0; - struct dirent *dirent; - DIR *dir; + struct epoll_event ev; + int epfd, ret, now, starttime, deltatime, saved_errno; - cfd = get_cgroup_fd(controller); - *list = NULL; - if (cfd < 0) + if ((starttime = time(NULL)) < 0) 
return false; - /* Make sure we pass a relative path to *at() family of functions. */ - len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */; - cg = alloca(len); - ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup); - if (ret < 0 || (size_t)ret >= len) { - lxcfs_error("Pathname too long under %s\n", cgroup); + if ((epfd = epoll_create(1)) < 0) { + lxcfs_error("%s\n", "Failed to create epoll socket: %m."); return false; } - fd = openat(cfd, cg, O_DIRECTORY); - if (fd < 0) - return false; - - dir = fdopendir(fd); - if (!dir) - return false; - - while ((dirent = readdir(dir))) { - struct stat mystat; - - if (!strcmp(dirent->d_name, ".") || - !strcmp(dirent->d_name, "..")) - continue; - - ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name); - if (ret < 0 || ret >= MAXPATHLEN) { - lxcfs_error("Pathname too long under %s\n", cg); - continue; - } - - ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW); - if (ret) { - lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno)); - continue; - } - if ((!directories && !S_ISREG(mystat.st_mode)) || - (directories && !S_ISDIR(mystat.st_mode))) - continue; - - if (sz+2 >= asz) { - void **tmp; - asz += BATCH_SIZE; - do { - tmp = realloc(*list, asz * typesize); - } while (!tmp); - *list = tmp; - } - (*list)[sz] = (*iterator)(controller, cg, dirent->d_name); - (*list)[sz+1] = NULL; - sz++; - } - if (closedir(dir) < 0) { - lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno)); - return false; - } - return true; -} - -static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry) -{ - char *dup; - do { - dup = strdup(dir_entry); - } while (!dup); - return dup; -} - -bool cgfs_list_children(const char *controller, const char *cgroup, char ***list) -{ - return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry); -} - -void free_key(struct cgfs_files *k) -{ - if (!k) - return; - free_disarm(k->name); 
- free_disarm(k); -} - -void free_keys(struct cgfs_files **keys) -{ - int i; - - if (!keys) - return; - for (i = 0; keys[i]; i++) { - free_key(keys[i]); - } - free_disarm(keys); -} - -bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file) -{ - int ret, cfd; - size_t len; - char *fnam; - - cfd = get_cgroup_fd(controller); - if (cfd < 0) - return false; - - /* Make sure we pass a relative path to *at() family of functions. - * . + /cgroup + / + file + \0 - */ - len = strlen(cgroup) + strlen(file) + 3; - fnam = alloca(len); - ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file); - if (ret < 0 || (size_t)ret >= len) - return false; - - return (faccessat(cfd, fnam, F_OK, 0) == 0); -} - -struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file) -{ - int ret, cfd; - size_t len; - char *fnam; - struct stat sb; - struct cgfs_files *newkey; - - cfd = get_cgroup_fd(controller); - if (cfd < 0) - return false; - - if (file && *file == '/') - file++; - - if (file && strchr(file, '/')) - return NULL; - - /* Make sure we pass a relative path to *at() family of functions. - * . + /cgroup + / + file + \0 - */ - len = strlen(cgroup) + 3; - if (file) - len += strlen(file) + 1; - fnam = alloca(len); - snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup, - file ? "/" : "", file ? 
file : ""); - - ret = fstatat(cfd, fnam, &sb, 0); - if (ret < 0) - return NULL; - - do { - newkey = malloc(sizeof(struct cgfs_files)); - } while (!newkey); - if (file) - newkey->name = must_copy_string(file); - else if (strrchr(cgroup, '/')) - newkey->name = must_copy_string(strrchr(cgroup, '/')); - else - newkey->name = must_copy_string(cgroup); - newkey->uid = sb.st_uid; - newkey->gid = sb.st_gid; - newkey->mode = sb.st_mode; - - return newkey; -} - -static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry) -{ - struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry); - if (!entry) { - lxcfs_error("Error getting files under %s:%s\n", controller, - cgroup); - } - return entry; -} - -bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys) -{ - return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry); -} - -bool is_child_cgroup(const char *controller, const char *cgroup, const char *f) -{ - int cfd; - size_t len; - char *fnam; - int ret; - struct stat sb; - - cfd = get_cgroup_fd(controller); - if (cfd < 0) - return false; - - /* Make sure we pass a relative path to *at() family of functions. - * . + /cgroup + / + f + \0 - */ - len = strlen(cgroup) + strlen(f) + 3; - fnam = alloca(len); - ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f); - if (ret < 0 || (size_t)ret >= len) - return false; - - ret = fstatat(cfd, fnam, &sb, 0); - if (ret < 0 || !S_ISDIR(sb.st_mode)) - return false; - - return true; -} - -#define SEND_CREDS_OK 0 -#define SEND_CREDS_NOTSK 1 -#define SEND_CREDS_FAIL 2 -static bool recv_creds(int sock, struct ucred *cred, char *v); -static int wait_for_pid(pid_t pid); -static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst); -static int send_creds_clone_wrapper(void *arg); - -/* - * clone a task which switches to @task's namespace and writes '1'. 
- * over a unix sock so we can read the task's reaper's pid in our - * namespace - * - * Note: glibc's fork() does not respect pidns, which can lead to failed - * assertions inside glibc (and thus failed forks) if the child's pid in - * the pidns and the parent pid outside are identical. Using clone prevents - * this issue. - */ -static void write_task_init_pid_exit(int sock, pid_t target) -{ - char fnam[100]; - pid_t pid; - int fd, ret; - size_t stack_size = sysconf(_SC_PAGESIZE); - void *stack = alloca(stack_size); - - ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target); - if (ret < 0 || ret >= sizeof(fnam)) - _exit(1); - - fd = open(fnam, O_RDONLY); - if (fd < 0) { - perror("write_task_init_pid_exit open of ns/pid"); - _exit(1); - } - if (setns(fd, 0)) { - perror("write_task_init_pid_exit setns 1"); - close(fd); - _exit(1); - } - pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock); - if (pid < 0) - _exit(1); - if (pid != 0) { - if (!wait_for_pid(pid)) - _exit(1); - _exit(0); - } -} - -static int send_creds_clone_wrapper(void *arg) { - struct ucred cred; - char v; - int sock = *(int *)arg; - - /* we are the child */ - cred.uid = 0; - cred.gid = 0; - cred.pid = 1; - v = '1'; - if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) - return 1; - return 0; -} - -static pid_t get_init_pid_for_task(pid_t task) -{ - int sock[2]; - pid_t pid; - pid_t ret = -1; - char v = '0'; - struct ucred cred; - - if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) { - perror("socketpair"); - return -1; - } - - pid = fork(); - if (pid < 0) - goto out; - if (!pid) { - close(sock[1]); - write_task_init_pid_exit(sock[0], task); - _exit(0); - } - - if (!recv_creds(sock[1], &cred, &v)) - goto out; - ret = cred.pid; - -out: - close(sock[0]); - close(sock[1]); - if (pid > 0) - wait_for_pid(pid); - return ret; -} - -pid_t lookup_initpid_in_store(pid_t qpid) -{ - pid_t answer = 0; - struct stat sb; - struct pidns_init_store *e; - char fnam[100]; - - 
snprintf(fnam, 100, "/proc/%d/ns/pid", qpid); - store_lock(); - if (stat(fnam, &sb) < 0) - goto out; - e = lookup_verify_initpid(&sb); - if (e) { - answer = e->initpid; - goto out; - } - answer = get_init_pid_for_task(qpid); - if (answer > 0) - save_initpid(&sb, answer); - -out: - /* we prune at end in case we are returning - * the value we were about to return */ - prune_initpid_store(); - store_unlock(); - return answer; -} - -static int wait_for_pid(pid_t pid) -{ - int status, ret; - - if (pid <= 0) - return -1; - -again: - ret = waitpid(pid, &status, 0); - if (ret == -1) { - if (errno == EINTR) - goto again; - return -1; - } - if (ret != pid) - goto again; - if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) - return -1; - return 0; -} - - -/* - * append pid to *src. - * src: a pointer to a char* in which ot append the pid. - * sz: the number of characters printed so far, minus trailing \0. - * asz: the allocated size so far - * pid: the pid to append - */ -static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid) -{ - must_strcat(src, sz, asz, "%d\n", (int)pid); -} - -/* - * Given a open file * to /proc/pid/{u,g}id_map, and an id - * valid in the caller's namespace, return the id mapped into - * pid's namespace. - * Returns the mapped id, or -1 on error. - */ -unsigned int -convert_id_to_ns(FILE *idfile, unsigned int in_id) -{ - unsigned int nsuid, // base id for a range in the idfile's namespace - hostuid, // base id for a range in the caller's namespace - count; // number of ids in this range - char line[400]; - int ret; - - fseek(idfile, 0L, SEEK_SET); - while (fgets(line, 400, idfile)) { - ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count); - if (ret != 3) - continue; - if (hostuid + count < hostuid || nsuid + count < nsuid) { - /* - * uids wrapped around - unexpected as this is a procfile, - * so just bail. 
- */ - lxcfs_error("pid wrapparound at entry %u %u %u in %s\n", - nsuid, hostuid, count, line); - return -1; - } - if (hostuid <= in_id && hostuid+count > in_id) { - /* - * now since hostuid <= in_id < hostuid+count, and - * hostuid+count and nsuid+count do not wrap around, - * we know that nsuid+(in_id-hostuid) which must be - * less that nsuid+(count) must not wrap around - */ - return (in_id - hostuid) + nsuid; - } - } - - // no answer found - return -1; -} - -/* - * for is_privileged_over, - * specify whether we require the calling uid to be root in his - * namespace - */ -#define NS_ROOT_REQD true -#define NS_ROOT_OPT false - -#define PROCLEN 100 - -static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root) -{ - char fpath[PROCLEN]; - int ret; - bool answer = false; - uid_t nsuid; - - if (victim == -1 || uid == -1) - return false; - - /* - * If the request is one not requiring root in the namespace, - * then having the same uid suffices. (i.e. uid 1000 has write - * access to files owned by uid 1000 - */ - if (!req_ns_root && uid == victim) - return true; - - ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid); - if (ret < 0 || ret >= PROCLEN) - return false; - FILE *f = fopen(fpath, "r"); - if (!f) - return false; - - /* if caller's not root in his namespace, reject */ - nsuid = convert_id_to_ns(f, uid); - if (nsuid) - goto out; - - /* - * If victim is not mapped into caller's ns, reject. 
- * XXX I'm not sure this check is needed given that fuse - * will be sending requests where the vfs has converted - */ - nsuid = convert_id_to_ns(f, victim); - if (nsuid == -1) - goto out; - - answer = true; - -out: - fclose(f); - return answer; -} - -static bool perms_include(int fmode, mode_t req_mode) -{ - mode_t r; - - switch (req_mode & O_ACCMODE) { - case O_RDONLY: - r = S_IROTH; - break; - case O_WRONLY: - r = S_IWOTH; - break; - case O_RDWR: - r = S_IROTH | S_IWOTH; - break; - default: - return false; - } - return ((fmode & r) == r); -} - - -/* - * taskcg is a/b/c - * querycg is /a/b/c/d/e - * we return 'd' - */ -static char *get_next_cgroup_dir(const char *taskcg, const char *querycg) -{ - char *start, *end; - - if (strlen(taskcg) <= strlen(querycg)) { - lxcfs_error("%s\n", "I was fed bad input."); - return NULL; - } - - if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0)) - start = strdup(taskcg + 1); - else - start = strdup(taskcg + strlen(querycg) + 1); - if (!start) - return NULL; - end = strchr(start, '/'); - if (end) - *end = '\0'; - return start; -} - -char *get_pid_cgroup(pid_t pid, const char *contrl) -{ - int cfd; - - cfd = get_cgroup_fd(contrl); - if (cfd < 0) - return false; - - if (pure_unified_layout(cgroup_ops)) - return cg_unified_get_current_cgroup(pid); - - return cg_legacy_get_current_cgroup(pid, contrl); -} - -/* - * check whether a fuse context may access a cgroup dir or file - * - * If file is not null, it is a cgroup file to check under cg. - * If file is null, then we are checking perms on cg itself. - * - * For files we can check the mode of the list_keys result. - * For cgroups, we must make assumptions based on the files under the - * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups - * yet. 
- */ -static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode) -{ - struct cgfs_files *k = NULL; - bool ret = false; - - k = cgfs_get_key(contrl, cg, file); - if (!k) - return false; - - if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) { - if (perms_include(k->mode >> 6, mode)) { - ret = true; - goto out; - } - } - if (fc->gid == k->gid) { - if (perms_include(k->mode >> 3, mode)) { - ret = true; - goto out; - } - } - ret = perms_include(k->mode, mode); - -out: - free_key(k); - return ret; -} - -#define INITSCOPE "/init.scope" -void prune_init_slice(char *cg) -{ - char *point; - size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE); - - if (cg_len < initscope_len) - return; - - point = cg + cg_len - initscope_len; - if (strcmp(point, INITSCOPE) == 0) { - if (point == cg) - *(point+1) = '\0'; - else - *point = '\0'; - } -} - -/* - * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d. - * If pid is in /a, he may act on /a/b, but not on /b. - * if the answer is false and nextcg is not NULL, then *nextcg will point - * to a string containing the next cgroup directory under cg, which must be - * freed by the caller. - */ -static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg) -{ - bool answer = false; - char *c2 = get_pid_cgroup(pid, contrl); - char *linecmp; - - if (!c2) - return false; - prune_init_slice(c2); - - /* - * callers pass in '/' or './' (openat()) for root cgroup, otherwise - * they pass in a cgroup without leading '/' - * - * The original line here was: - * linecmp = *cg == '/' ? c2 : c2+1; - * TODO: I'm not sure why you'd want to increment when *cg != '/'? - * Serge, do you know? 
- */ - if (*cg == '/' || !strncmp(cg, "./", 2)) - linecmp = c2; - else - linecmp = c2 + 1; - if (strncmp(linecmp, cg, strlen(linecmp)) != 0) { - if (nextcg) { - *nextcg = get_next_cgroup_dir(linecmp, cg); - } - goto out; - } - answer = true; - -out: - free(c2); - return answer; -} - -/* - * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c. - */ -static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg) -{ - bool answer = false; - char *c2, *task_cg; - size_t target_len, task_len; - - if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0) - return true; - - c2 = get_pid_cgroup(pid, contrl); - if (!c2) - return false; - prune_init_slice(c2); - - task_cg = c2 + 1; - target_len = strlen(cg); - task_len = strlen(task_cg); - if (task_len == 0) { - /* Task is in the root cg, it can see everything. This case is - * not handled by the strmcps below, since they test for the - * last /, but that is the first / that we've chopped off - * above. - */ - answer = true; - goto out; - } - if (strcmp(cg, task_cg) == 0) { - answer = true; - goto out; - } - if (target_len < task_len) { - /* looking up a parent dir */ - if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/') - answer = true; - goto out; - } - if (target_len > task_len) { - /* looking up a child dir */ - if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/') - answer = true; - goto out; - } - -out: - free(c2); - return answer; -} - -/* - * given /cgroup/freezer/a/b, return "freezer". - * the returned char* should NOT be freed. 
- */ -static char *pick_controller_from_path(struct fuse_context *fc, const char *path) -{ - const char *p1; - char *contr, *slash; - - if (strlen(path) < 9) { - errno = EACCES; - return NULL; - } - if (*(path + 7) != '/') { - errno = EINVAL; - return NULL; - } - p1 = path + 8; - contr = strdupa(p1); - if (!contr) { - errno = ENOMEM; - return NULL; - } - slash = strstr(contr, "/"); - if (slash) - *slash = '\0'; - - for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { - if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0) - return (*h)->__controllers; - } - errno = ENOENT; - return NULL; -} - -/* - * Find the start of cgroup in /cgroup/controller/the/cgroup/path - * Note that the returned value may include files (keynames) etc - */ -static const char *find_cgroup_in_path(const char *path) -{ - const char *p1; - - if (strlen(path) < 9) { - errno = EACCES; - return NULL; - } - p1 = strstr(path + 8, "/"); - if (!p1) { - errno = EINVAL; - return NULL; - } - errno = 0; - return p1 + 1; -} - -/* - * split the last path element from the path in @cg. 
- * @dir is newly allocated and should be freed, @last not -*/ -static void get_cgdir_and_path(const char *cg, char **dir, char **last) -{ - char *p; - - do { - *dir = strdup(cg); - } while (!*dir); - *last = strrchr(cg, '/'); - if (!*last) { - *last = NULL; - return; - } - p = strrchr(*dir, '/'); - *p = '\0'; -} - -/* - * FUSE ops for /cgroup - */ - -int cg_getattr(const char *path, struct stat *sb) -{ - struct timespec now; - struct fuse_context *fc = fuse_get_context(); - char * cgdir = NULL; - char *last = NULL, *path1, *path2; - struct cgfs_files *k = NULL; - const char *cgroup; - const char *controller = NULL; - int ret = -ENOENT; - - - if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) - return -EIO; - - memset(sb, 0, sizeof(struct stat)); - - if (clock_gettime(CLOCK_REALTIME, &now) < 0) - return -EINVAL; - - sb->st_uid = sb->st_gid = 0; - sb->st_atim = sb->st_mtim = sb->st_ctim = now; - sb->st_size = 0; - - if (strcmp(path, "/cgroup") == 0) { - sb->st_mode = S_IFDIR | 00755; - sb->st_nlink = 2; - return 0; - } - - controller = pick_controller_from_path(fc, path); - if (!controller) - return -errno; - cgroup = find_cgroup_in_path(path); - if (!cgroup) { - /* this is just /cgroup/controller, return it as a dir */ - sb->st_mode = S_IFDIR | 00755; - sb->st_nlink = 2; - return 0; - } - - get_cgdir_and_path(cgroup, &cgdir, &last); - - if (!last) { - path1 = "/"; - path2 = cgdir; - } else { - path1 = cgdir; - path2 = last; - } - - pid_t initpid = lookup_initpid_in_store(fc->pid); - if (initpid <= 1 || is_shared_pidns(initpid)) - initpid = fc->pid; - /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys. 
- * Then check that caller's cgroup is under path if last is a child - * cgroup, or cgdir if last is a file */ - - if (is_child_cgroup(controller, path1, path2)) { - if (!caller_may_see_dir(initpid, controller, cgroup)) { - ret = -ENOENT; - goto out; - } - if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) { - /* this is just /cgroup/controller, return it as a dir */ - sb->st_mode = S_IFDIR | 00555; - sb->st_nlink = 2; - ret = 0; - goto out; - } - if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) { - ret = -EACCES; - goto out; - } - - // get uid, gid, from '/tasks' file and make up a mode - // That is a hack, until cgmanager gains a GetCgroupPerms fn. - sb->st_mode = S_IFDIR | 00755; - k = cgfs_get_key(controller, cgroup, NULL); - if (!k) { - sb->st_uid = sb->st_gid = 0; - } else { - sb->st_uid = k->uid; - sb->st_gid = k->gid; - } - free_key(k); - sb->st_nlink = 2; - ret = 0; - goto out; - } - - if ((k = cgfs_get_key(controller, path1, path2)) != NULL) { - sb->st_mode = S_IFREG | k->mode; - sb->st_nlink = 1; - sb->st_uid = k->uid; - sb->st_gid = k->gid; - sb->st_size = 0; - free_key(k); - if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) { - ret = -ENOENT; - goto out; - } - ret = 0; - } - -out: - free(cgdir); - return ret; -} - -int cg_opendir(const char *path, struct fuse_file_info *fi) -{ - struct fuse_context *fc = fuse_get_context(); - const char *cgroup; - struct file_info *dir_info; - char *controller = NULL; - - if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) - return -EIO; - - if (strcmp(path, "/cgroup") == 0) { - cgroup = NULL; - controller = NULL; - } else { - // return list of keys for the controller, and list of child cgroups - controller = pick_controller_from_path(fc, path); - if (!controller) - return -errno; - - cgroup = find_cgroup_in_path(path); - if (!cgroup) { - /* this is just /cgroup/controller, return its contents */ - cgroup = "/"; - } - } - - pid_t initpid = lookup_initpid_in_store(fc->pid); - if 
(initpid <= 1 || is_shared_pidns(initpid)) - initpid = fc->pid; - if (cgroup) { - if (!caller_may_see_dir(initpid, controller, cgroup)) - return -ENOENT; - if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) - return -EACCES; - } - - /* we'll free this at cg_releasedir */ - dir_info = malloc(sizeof(*dir_info)); - if (!dir_info) - return -ENOMEM; - dir_info->controller = must_copy_string(controller); - dir_info->cgroup = must_copy_string(cgroup); - dir_info->type = LXC_TYPE_CGDIR; - dir_info->buf = NULL; - dir_info->file = NULL; - dir_info->buflen = 0; - - fi->fh = (unsigned long)dir_info; - return 0; -} - -int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, - struct fuse_file_info *fi) -{ - struct file_info *d = (struct file_info *)fi->fh; - struct cgfs_files **list = NULL; - int i, ret; - char *nextcg = NULL; - struct fuse_context *fc = fuse_get_context(); - char **clist = NULL; - - if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) - return -EIO; - - if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0) - return -EIO; - - if (d->type != LXC_TYPE_CGDIR) { - lxcfs_error("%s\n", "Internal error: file cache info used in readdir."); - return -EIO; - } - if (!d->cgroup && !d->controller) { - /* - * ls /var/lib/lxcfs/cgroup - just show list of controllers. - * This only works with the legacy hierarchy. 
- */ - for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { - if (is_unified_hierarchy(*h)) - continue; - - if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0)) - return -EIO; - } - - return 0; - } - - if (!cgfs_list_keys(d->controller, d->cgroup, &list)) { - // not a valid cgroup - ret = -EINVAL; - goto out; - } - - pid_t initpid = lookup_initpid_in_store(fc->pid); - if (initpid <= 1 || is_shared_pidns(initpid)) - initpid = fc->pid; - if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) { - if (nextcg) { - ret = filler(buf, nextcg, NULL, 0); - free(nextcg); - if (ret != 0) { - ret = -EIO; - goto out; - } - } - ret = 0; - goto out; - } - - for (i = 0; list && list[i]; i++) { - if (filler(buf, list[i]->name, NULL, 0) != 0) { - ret = -EIO; - goto out; - } - } - - // now get the list of child cgroups - - if (!cgfs_list_children(d->controller, d->cgroup, &clist)) { - ret = 0; - goto out; - } - if (clist) { - for (i = 0; clist[i]; i++) { - if (filler(buf, clist[i], NULL, 0) != 0) { - ret = -EIO; - goto out; - } - } - } - ret = 0; - -out: - free_keys(list); - if (clist) { - for (i = 0; clist[i]; i++) - free(clist[i]); - free(clist); - } - return ret; -} - -void do_release_file_info(struct fuse_file_info *fi) -{ - struct file_info *f = (struct file_info *)fi->fh; - - if (!f) - return; - - fi->fh = 0; - - free_disarm(f->controller); - free_disarm(f->cgroup); - free_disarm(f->file); - free_disarm(f->buf); - free_disarm(f); -} - -int cg_releasedir(const char *path, struct fuse_file_info *fi) -{ - do_release_file_info(fi); - return 0; -} - -int cg_open(const char *path, struct fuse_file_info *fi) -{ - const char *cgroup; - char *last = NULL, *path1, *path2, * cgdir = NULL, *controller; - struct cgfs_files *k = NULL; - struct file_info *file_info; - struct fuse_context *fc = fuse_get_context(); - int ret; - - if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) - return -EIO; - - controller = 
pick_controller_from_path(fc, path); - if (!controller) - return -errno; - cgroup = find_cgroup_in_path(path); - if (!cgroup) - return -errno; - - get_cgdir_and_path(cgroup, &cgdir, &last); - if (!last) { - path1 = "/"; - path2 = cgdir; - } else { - path1 = cgdir; - path2 = last; - } - - k = cgfs_get_key(controller, path1, path2); - if (!k) { - ret = -EINVAL; - goto out; - } - free_key(k); - - pid_t initpid = lookup_initpid_in_store(fc->pid); - if (initpid <= 1 || is_shared_pidns(initpid)) - initpid = fc->pid; - if (!caller_may_see_dir(initpid, controller, path1)) { - ret = -ENOENT; - goto out; - } - if (!fc_may_access(fc, controller, path1, path2, fi->flags)) { - ret = -EACCES; - goto out; - } - - /* we'll free this at cg_release */ - file_info = malloc(sizeof(*file_info)); - if (!file_info) { - ret = -ENOMEM; - goto out; - } - file_info->controller = must_copy_string(controller); - file_info->cgroup = must_copy_string(path1); - file_info->file = must_copy_string(path2); - file_info->type = LXC_TYPE_CGFILE; - file_info->buf = NULL; - file_info->buflen = 0; - - fi->fh = (unsigned long)file_info; - ret = 0; - -out: - free(cgdir); - return ret; -} - -int cg_access(const char *path, int mode) -{ - int ret; - const char *cgroup; - char *path1, *path2, *controller; - char *last = NULL, *cgdir = NULL; - struct cgfs_files *k = NULL; - struct fuse_context *fc = fuse_get_context(); - - if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) - return -EIO; - - if (strcmp(path, "/cgroup") == 0) - return 0; - - controller = pick_controller_from_path(fc, path); - if (!controller) - return -errno; - cgroup = find_cgroup_in_path(path); - if (!cgroup) { - // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not - if ((mode & W_OK) == 0) - return 0; - return -EACCES; - } - - get_cgdir_and_path(cgroup, &cgdir, &last); - if (!last) { - path1 = "/"; - path2 = cgdir; - } else { - path1 = cgdir; - path2 = last; - } - - k = cgfs_get_key(controller, path1, path2); - if (!k) { - if 
((mode & W_OK) == 0) - ret = 0; - else - ret = -EACCES; - goto out; - } - free_key(k); - - pid_t initpid = lookup_initpid_in_store(fc->pid); - if (initpid <= 1 || is_shared_pidns(initpid)) - initpid = fc->pid; - if (!caller_may_see_dir(initpid, controller, path1)) { - ret = -ENOENT; - goto out; - } - if (!fc_may_access(fc, controller, path1, path2, mode)) { - ret = -EACCES; - goto out; - } - - ret = 0; - -out: - free(cgdir); - return ret; -} - -int cg_release(const char *path, struct fuse_file_info *fi) -{ - do_release_file_info(fi); - return 0; -} - -#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP ) - -static bool wait_for_sock(int sock, int timeout) -{ - struct epoll_event ev; - int epfd, ret, now, starttime, deltatime, saved_errno; - - if ((starttime = time(NULL)) < 0) - return false; - - if ((epfd = epoll_create(1)) < 0) { - lxcfs_error("%s\n", "Failed to create epoll socket: %m."); - return false; - } - - ev.events = POLLIN_SET; - ev.data.fd = sock; - if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) { - lxcfs_error("%s\n", "Failed adding socket to epoll: %m."); - close(epfd); - return false; - } - -again: - if ((now = time(NULL)) < 0) { - close(epfd); - return false; - } - - deltatime = (starttime + timeout) - now; - if (deltatime < 0) { // timeout - errno = 0; - close(epfd); - return false; - } - ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1); - if (ret < 0 && errno == EINTR) - goto again; - saved_errno = errno; - close(epfd); - - if (ret <= 0) { - errno = saved_errno; - return false; - } - return true; -} - -static int msgrecv(int sockfd, void *buf, size_t len) -{ - if (!wait_for_sock(sockfd, 2)) - return -1; - return recv(sockfd, buf, len, MSG_DONTWAIT); -} - -static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst) -{ - struct msghdr msg = { 0 }; - struct iovec iov; - struct cmsghdr *cmsg; - char cmsgbuf[CMSG_SPACE(sizeof(*cred))]; - char buf[1]; - buf[0] = 'p'; - - if (pingfirst) { - if (msgrecv(sock, buf, 1) != 1) { - 
lxcfs_error("%s\n", "Error getting reply from server over socketpair."); - return SEND_CREDS_FAIL; - } - } - - msg.msg_control = cmsgbuf; - msg.msg_controllen = sizeof(cmsgbuf); - - cmsg = CMSG_FIRSTHDR(&msg); - cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred)); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_CREDENTIALS; - memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred)); - - msg.msg_name = NULL; - msg.msg_namelen = 0; - - buf[0] = v; - iov.iov_base = buf; - iov.iov_len = sizeof(buf); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - if (sendmsg(sock, &msg, 0) < 0) { - lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno)); - if (errno == 3) - return SEND_CREDS_NOTSK; - return SEND_CREDS_FAIL; - } - - return SEND_CREDS_OK; -} - -static bool recv_creds(int sock, struct ucred *cred, char *v) -{ - struct msghdr msg = { 0 }; - struct iovec iov; - struct cmsghdr *cmsg; - char cmsgbuf[CMSG_SPACE(sizeof(*cred))]; - char buf[1]; - int ret; - int optval = 1; - - *v = '1'; - - cred->pid = -1; - cred->uid = -1; - cred->gid = -1; - - if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) { - lxcfs_error("Failed to set passcred: %s\n", strerror(errno)); - return false; - } - buf[0] = '1'; - if (write(sock, buf, 1) != 1) { - lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno)); - return false; - } - - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = cmsgbuf; - msg.msg_controllen = sizeof(cmsgbuf); - - iov.iov_base = buf; - iov.iov_len = sizeof(buf); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - if (!wait_for_sock(sock, 2)) { - lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno)); - return false; - } - ret = recvmsg(sock, &msg, MSG_DONTWAIT); - if (ret < 0) { - lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno)); - return false; - } - - cmsg = CMSG_FIRSTHDR(&msg); - - if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) && - cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == 
SCM_CREDENTIALS) { - memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred)); - } - *v = buf[0]; - - return true; -} - -struct pid_ns_clone_args { - int *cpipe; - int sock; - pid_t tpid; - int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns -}; - -/* - * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage - * with clone(). This simply writes '1' as ACK back to the parent - * before calling the actual wrapped function. - */ -static int pid_ns_clone_wrapper(void *arg) { - struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg; - char b = '1'; - - close(args->cpipe[0]); - if (write(args->cpipe[1], &b, sizeof(char)) < 0) - lxcfs_error("(child): error on write: %s.\n", strerror(errno)); - close(args->cpipe[1]); - return args->wrapped(args->sock, args->tpid); -} - -/* - * pid_to_ns - reads pids from a ucred over a socket, then writes the - * int value back over the socket. This shifts the pid from the - * sender's pidns into tpid's pidns. - */ -static int pid_to_ns(int sock, pid_t tpid) -{ - char v = '0'; - struct ucred cred; - - while (recv_creds(sock, &cred, &v)) { - if (v == '1') - return 0; - if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t)) - return 1; - } - return 0; -} - - -/* - * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain - * in your old pidns. Only children which you clone will be in the target - * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to - * actually convert pids. - * - * Note: glibc's fork() does not respect pidns, which can lead to failed - * assertions inside glibc (and thus failed forks) if the child's pid in - * the pidns and the parent pid outside are identical. Using clone prevents - * this issue. 
- */ -static void pid_to_ns_wrapper(int sock, pid_t tpid) -{ - int newnsfd = -1, ret, cpipe[2]; - char fnam[100]; - pid_t cpid; - char v; - - ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid); - if (ret < 0 || ret >= sizeof(fnam)) - _exit(1); - newnsfd = open(fnam, O_RDONLY); - if (newnsfd < 0) - _exit(1); - if (setns(newnsfd, 0) < 0) - _exit(1); - close(newnsfd); - - if (pipe(cpipe) < 0) - _exit(1); - - struct pid_ns_clone_args args = { - .cpipe = cpipe, - .sock = sock, - .tpid = tpid, - .wrapped = &pid_to_ns - }; - size_t stack_size = sysconf(_SC_PAGESIZE); - void *stack = alloca(stack_size); - - cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args); - if (cpid < 0) - _exit(1); - - // give the child 1 second to be done forking and - // write its ack - if (!wait_for_sock(cpipe[0], 1)) - _exit(1); - ret = read(cpipe[0], &v, 1); - if (ret != sizeof(char) || v != '1') - _exit(1); - - if (!wait_for_pid(cpid)) - _exit(1); - _exit(0); -} - -/* - * To read cgroup files with a particular pid, we will setns into the child - * pidns, open a pipe, fork a child - which will be the first to really be in - * the child ns - which does the cgfs_get_value and writes the data to the pipe. 
- */ -bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d) -{ - int sock[2] = {-1, -1}; - char *tmpdata = NULL; - int ret; - pid_t qpid, cpid = -1; - bool answer = false; - char v = '0'; - struct ucred cred; - size_t sz = 0, asz = 0; - - if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata)) - return false; - - /* - * Now we read the pids from returned data one by one, pass - * them into a child in the target namespace, read back the - * translated pids, and put them into our to-return data - */ - - if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) { - perror("socketpair"); - free(tmpdata); - return false; - } - - cpid = fork(); - if (cpid == -1) - goto out; - - if (!cpid) // child - exits when done - pid_to_ns_wrapper(sock[1], tpid); - - char *ptr = tmpdata; - cred.uid = 0; - cred.gid = 0; - while (sscanf(ptr, "%d\n", &qpid) == 1) { - cred.pid = qpid; - ret = send_creds(sock[0], &cred, v, true); - - if (ret == SEND_CREDS_NOTSK) - goto next; - if (ret == SEND_CREDS_FAIL) - goto out; - - // read converted results - if (!wait_for_sock(sock[0], 2)) { - lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno)); - goto out; - } - if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { - lxcfs_error("Error reading pid from child: %s.\n", strerror(errno)); - goto out; - } - must_strcat_pid(d, &sz, &asz, qpid); -next: - ptr = strchr(ptr, '\n'); - if (!ptr) - break; - ptr++; - } - - cred.pid = getpid(); - v = '1'; - if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) { - // failed to ask child to exit - lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno)); - goto out; - } - - answer = true; - -out: - free(tmpdata); - if (cpid != -1) - wait_for_pid(cpid); - if (sock[0] != -1) { - close(sock[0]); - close(sock[1]); - } - return answer; -} - -int cg_read(const char *path, char *buf, size_t size, off_t offset, - struct fuse_file_info *fi) -{ - struct fuse_context *fc = fuse_get_context(); - 
struct file_info *f = (struct file_info *)fi->fh; - struct cgfs_files *k = NULL; - char *data = NULL; - int ret, s; - bool r; - - if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) - return -EIO; - - if (f->type != LXC_TYPE_CGFILE) { - lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read."); - return -EIO; - } - - if (offset) - return 0; - - if (!f->controller) - return -EINVAL; - - if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) { - return -EINVAL; - } - free_key(k); - - - if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { - ret = -EACCES; - goto out; - } - - if (strcmp(f->file, "tasks") == 0 || - strcmp(f->file, "/tasks") == 0 || - strcmp(f->file, "/cgroup.procs") == 0 || - strcmp(f->file, "cgroup.procs") == 0) - // special case - we have to translate the pids - r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data); - else - r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data); - - if (!r) { - ret = -EINVAL; - goto out; - } - - if (!data) { - ret = 0; - goto out; - } - s = strlen(data); - if (s > size) - s = size; - memcpy(buf, data, s); - if (s > 0 && s < size && data[s-1] != '\n') - buf[s++] = '\n'; - - ret = s; - -out: - free(data); - return ret; -} - -static int pid_from_ns(int sock, pid_t tpid) -{ - pid_t vpid; - struct ucred cred; - char v; - int ret; - - cred.uid = 0; - cred.gid = 0; - while (1) { - if (!wait_for_sock(sock, 2)) { - lxcfs_error("%s\n", "Timeout reading from parent."); - return 1; - } - if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) { - lxcfs_error("Bad read from parent: %s.\n", strerror(errno)); - return 1; - } - if (vpid == -1) // done - break; - v = '0'; - cred.pid = vpid; - if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) { - v = '1'; - cred.pid = getpid(); - if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK) - return 1; - } - } - return 0; -} - -static void pid_from_ns_wrapper(int sock, pid_t tpid) -{ - int 
newnsfd = -1, ret, cpipe[2]; - char fnam[100]; - pid_t cpid; - char v; - - ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid); - if (ret < 0 || ret >= sizeof(fnam)) - _exit(1); - newnsfd = open(fnam, O_RDONLY); - if (newnsfd < 0) - _exit(1); - if (setns(newnsfd, 0) < 0) - _exit(1); - close(newnsfd); - - if (pipe(cpipe) < 0) - _exit(1); - - struct pid_ns_clone_args args = { - .cpipe = cpipe, - .sock = sock, - .tpid = tpid, - .wrapped = &pid_from_ns - }; - size_t stack_size = sysconf(_SC_PAGESIZE); - void *stack = alloca(stack_size); - - cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args); - if (cpid < 0) - _exit(1); - - // give the child 1 second to be done forking and - // write its ack - if (!wait_for_sock(cpipe[0], 1)) - _exit(1); - ret = read(cpipe[0], &v, 1); - if (ret != sizeof(char) || v != '1') - _exit(1); - - if (!wait_for_pid(cpid)) - _exit(1); - _exit(0); -} - -/* - * Given host @uid, return the uid to which it maps in - * @pid's user namespace, or -1 if none. - */ -bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer) -{ - FILE *f; - char line[400]; - - sprintf(line, "/proc/%d/uid_map", pid); - if ((f = fopen(line, "r")) == NULL) { - return false; - } - - *answer = convert_id_to_ns(f, uid); - fclose(f); - - if (*answer == -1) - return false; - return true; -} - -/* - * get_pid_creds: get the real uid and gid of @pid from - * /proc/$$/status - * (XXX should we use euid here?) 
- */ -void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid) -{ - char line[400]; - uid_t u; - gid_t g; - FILE *f; - - *uid = -1; - *gid = -1; - sprintf(line, "/proc/%d/status", pid); - if ((f = fopen(line, "r")) == NULL) { - lxcfs_error("Error opening %s: %s\n", line, strerror(errno)); - return; - } - while (fgets(line, 400, f)) { - if (strncmp(line, "Uid:", 4) == 0) { - if (sscanf(line+4, "%u", &u) != 1) { - lxcfs_error("bad uid line for pid %u\n", pid); - fclose(f); - return; - } - *uid = u; - } else if (strncmp(line, "Gid:", 4) == 0) { - if (sscanf(line+4, "%u", &g) != 1) { - lxcfs_error("bad gid line for pid %u\n", pid); - fclose(f); - return; - } - *gid = g; - } - } - fclose(f); -} - -/* - * May the requestor @r move victim @v to a new cgroup? - * This is allowed if - * . they are the same task - * . they are ownedy by the same uid - * . @r is root on the host, or - * . @v's uid is mapped into @r's where @r is root. - */ -bool may_move_pid(pid_t r, uid_t r_uid, pid_t v) -{ - uid_t v_uid, tmpuid; - gid_t v_gid; - - if (r == v) - return true; - if (r_uid == 0) - return true; - get_pid_creds(v, &v_uid, &v_gid); - if (r_uid == v_uid) - return true; - if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0 - && hostuid_to_ns(v_uid, r, &tmpuid)) - return true; - return false; -} - -static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg, - const char *file, const char *buf) -{ - int sock[2] = {-1, -1}; - pid_t qpid, cpid = -1; - FILE *pids_file = NULL; - bool answer = false, fail = false; - - pids_file = open_pids_file(contrl, cg); - if (!pids_file) + ev.events = POLLIN_SET; + ev.data.fd = sock; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) { + lxcfs_error("%s\n", "Failed adding socket to epoll: %m."); + close(epfd); return false; - - /* - * write the pids to a socket, have helper in writer's pidns - * call movepid for us - */ - if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) { - perror("socketpair"); - goto out; - } - - cpid = 
fork(); - if (cpid == -1) - goto out; - - if (!cpid) { // child - fclose(pids_file); - pid_from_ns_wrapper(sock[1], tpid); - } - - const char *ptr = buf; - while (sscanf(ptr, "%d", &qpid) == 1) { - struct ucred cred; - char v; - - if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { - lxcfs_error("Error writing pid to child: %s.\n", strerror(errno)); - goto out; - } - - if (recv_creds(sock[0], &cred, &v)) { - if (v == '0') { - if (!may_move_pid(tpid, tuid, cred.pid)) { - fail = true; - break; - } - if (fprintf(pids_file, "%d", (int) cred.pid) < 0) - fail = true; - } - } - - ptr = strchr(ptr, '\n'); - if (!ptr) - break; - ptr++; - } - - /* All good, write the value */ - qpid = -1; - if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid)) - lxcfs_error("%s\n", "Warning: failed to ask child to exit."); - - if (!fail) - answer = true; - -out: - if (cpid != -1) - wait_for_pid(cpid); - if (sock[0] != -1) { - close(sock[0]); - close(sock[1]); } - if (pids_file) { - if (fclose(pids_file) != 0) - answer = false; - } - return answer; -} - -int cg_write(const char *path, const char *buf, size_t size, off_t offset, - struct fuse_file_info *fi) -{ - struct fuse_context *fc = fuse_get_context(); - char *localbuf = NULL; - struct cgfs_files *k = NULL; - struct file_info *f = (struct file_info *)fi->fh; - bool r; - if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) - return -EIO; - - if (f->type != LXC_TYPE_CGFILE) { - lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write."); - return -EIO; +again: + if ((now = time(NULL)) < 0) { + close(epfd); + return false; } - if (offset) - return 0; - - localbuf = alloca(size+1); - localbuf[size] = '\0'; - memcpy(localbuf, buf, size); - - if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) { - size = -EINVAL; - goto out; + deltatime = (starttime + timeout) - now; + if (deltatime < 0) { // timeout + errno = 0; + close(epfd); + return false; } + ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 
1); + if (ret < 0 && errno == EINTR) + goto again; + saved_errno = errno; + close(epfd); - if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) { - size = -EACCES; - goto out; + if (ret <= 0) { + errno = saved_errno; + return false; } - - if (strcmp(f->file, "tasks") == 0 || - strcmp(f->file, "/tasks") == 0 || - strcmp(f->file, "/cgroup.procs") == 0 || - strcmp(f->file, "cgroup.procs") == 0) - // special case - we have to translate the pids - r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf); - else - r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf); - - if (!r) - size = -EINVAL; - -out: - free_key(k); - return size; + return true; } -int cg_chown(const char *path, uid_t uid, gid_t gid) +static int msgrecv(int sockfd, void *buf, size_t len) { - struct fuse_context *fc = fuse_get_context(); - char *cgdir = NULL, *last = NULL, *path1, *path2, *controller; - struct cgfs_files *k = NULL; - const char *cgroup; - int ret; - - if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) - return -EIO; - - if (strcmp(path, "/cgroup") == 0) - return -EPERM; - - controller = pick_controller_from_path(fc, path); - if (!controller) - return errno == ENOENT ? -EPERM : -errno; - - cgroup = find_cgroup_in_path(path); - if (!cgroup) - /* this is just /cgroup/controller */ - return -EPERM; - - get_cgdir_and_path(cgroup, &cgdir, &last); - - if (!last) { - path1 = "/"; - path2 = cgdir; - } else { - path1 = cgdir; - path2 = last; - } - - if (is_child_cgroup(controller, path1, path2)) { - // get uid, gid, from '/tasks' file and make up a mode - // That is a hack, until cgmanager gains a GetCgroupPerms fn. - k = cgfs_get_key(controller, cgroup, "tasks"); - - } else - k = cgfs_get_key(controller, path1, path2); - - if (!k) { - ret = -EINVAL; - goto out; - } - - /* - * This being a fuse request, the uid and gid must be valid - * in the caller's namespace. 
So we can just check to make - * sure that the caller is root in his uid, and privileged - * over the file's current owner. - */ - if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) { - ret = -EACCES; - goto out; - } - - ret = cgfs_chown_file(controller, cgroup, uid, gid); - -out: - free_key(k); - free(cgdir); - - return ret; + if (!wait_for_sock(sockfd, 2)) + return -1; + return recv(sockfd, buf, len, MSG_DONTWAIT); } -int cg_chmod(const char *path, mode_t mode) +static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst) { - struct fuse_context *fc = fuse_get_context(); - char * cgdir = NULL, *last = NULL, *path1, *path2, *controller; - struct cgfs_files *k = NULL; - const char *cgroup; - int ret; - - if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) - return -EIO; - - if (strcmp(path, "/cgroup") == 0) - return -EPERM; - - controller = pick_controller_from_path(fc, path); - if (!controller) - return errno == ENOENT ? -EPERM : -errno; - - cgroup = find_cgroup_in_path(path); - if (!cgroup) - /* this is just /cgroup/controller */ - return -EPERM; - - get_cgdir_and_path(cgroup, &cgdir, &last); + struct msghdr msg = { 0 }; + struct iovec iov; + struct cmsghdr *cmsg; + char cmsgbuf[CMSG_SPACE(sizeof(*cred))]; + char buf[1]; + buf[0] = 'p'; - if (!last) { - path1 = "/"; - path2 = cgdir; - } else { - path1 = cgdir; - path2 = last; + if (pingfirst) { + if (msgrecv(sock, buf, 1) != 1) { + lxcfs_error("%s\n", "Error getting reply from server over socketpair."); + return SEND_CREDS_FAIL; + } } - if (is_child_cgroup(controller, path1, path2)) { - // get uid, gid, from '/tasks' file and make up a mode - // That is a hack, until cgmanager gains a GetCgroupPerms fn. 
- k = cgfs_get_key(controller, cgroup, "tasks"); + msg.msg_control = cmsgbuf; + msg.msg_controllen = sizeof(cmsgbuf); - } else - k = cgfs_get_key(controller, path1, path2); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_CREDENTIALS; + memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred)); - if (!k) { - ret = -EINVAL; - goto out; - } + msg.msg_name = NULL; + msg.msg_namelen = 0; - /* - * This being a fuse request, the uid and gid must be valid - * in the caller's namespace. So we can just check to make - * sure that the caller is root in his uid, and privileged - * over the file's current owner. - */ - if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) { - ret = -EPERM; - goto out; - } + buf[0] = v; + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; - if (!cgfs_chmod_file(controller, cgroup, mode)) { - ret = -EINVAL; - goto out; + if (sendmsg(sock, &msg, 0) < 0) { + lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno)); + if (errno == 3) + return SEND_CREDS_NOTSK; + return SEND_CREDS_FAIL; } - ret = 0; -out: - free_key(k); - free(cgdir); - return ret; + return SEND_CREDS_OK; } -int cg_mkdir(const char *path, mode_t mode) +static bool recv_creds(int sock, struct ucred *cred, char *v) { - struct fuse_context *fc = fuse_get_context(); - char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL; - const char *cgroup; + struct msghdr msg = { 0 }; + struct iovec iov; + struct cmsghdr *cmsg; + char cmsgbuf[CMSG_SPACE(sizeof(*cred))]; + char buf[1]; int ret; + int optval = 1; - if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) - return -EIO; - - controller = pick_controller_from_path(fc, path); - if (!controller) - return errno == ENOENT ? 
-EPERM : -errno; - - cgroup = find_cgroup_in_path(path); - if (!cgroup) - return -errno; - - get_cgdir_and_path(cgroup, &cgdir, &last); - if (!last) - path1 = "/"; - else - path1 = cgdir; + *v = '1'; - pid_t initpid = lookup_initpid_in_store(fc->pid); - if (initpid <= 1 || is_shared_pidns(initpid)) - initpid = fc->pid; - if (!caller_is_in_ancestor(initpid, controller, path1, &next)) { - if (!next) - ret = -EINVAL; - else if (last && strcmp(next, last) == 0) - ret = -EEXIST; - else - ret = -EPERM; - goto out; - } + cred->pid = -1; + cred->uid = -1; + cred->gid = -1; - if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) { - ret = -EACCES; - goto out; + if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) { + lxcfs_error("Failed to set passcred: %s\n", strerror(errno)); + return false; } - if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) { - ret = -EACCES; - goto out; + buf[0] = '1'; + if (write(sock, buf, 1) != 1) { + lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno)); + return false; } - ret = cgfs_create(controller, cgroup, fc->uid, fc->gid); - -out: - free(cgdir); - free(next); - return ret; -} - -int cg_rmdir(const char *path) -{ - struct fuse_context *fc = fuse_get_context(); - char *last = NULL, *cgdir = NULL, *controller, *next = NULL; - const char *cgroup; - int ret; - - if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) - return -EIO; - - controller = pick_controller_from_path(fc, path); - if (!controller) /* Someone's trying to delete "/cgroup". */ - return -EPERM; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = cmsgbuf; + msg.msg_controllen = sizeof(cmsgbuf); - cgroup = find_cgroup_in_path(path); - if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". 
*/ - return -EPERM; + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; - get_cgdir_and_path(cgroup, &cgdir, &last); - if (!last) { - /* Someone's trying to delete a cgroup on the same level as the - * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or - * rmdir "/cgroup/blkio/init.slice". - */ - ret = -EPERM; - goto out; + if (!wait_for_sock(sock, 2)) { + lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno)); + return false; } - - pid_t initpid = lookup_initpid_in_store(fc->pid); - if (initpid <= 1 || is_shared_pidns(initpid)) - initpid = fc->pid; - if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) { - if (!last || (next && (strcmp(next, last) == 0))) - ret = -EBUSY; - else - ret = -ENOENT; - goto out; + ret = recvmsg(sock, &msg, MSG_DONTWAIT); + if (ret < 0) { + lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno)); + return false; } - if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) { - ret = -EACCES; - goto out; - } - if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) { - ret = -EACCES; - goto out; - } + cmsg = CMSG_FIRSTHDR(&msg); - if (!cgfs_remove(controller, cgroup)) { - ret = -EINVAL; - goto out; + if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) && + cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS) { + memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred)); } + *v = buf[0]; - ret = 0; - -out: - free(cgdir); - free(next); - return ret; + return true; } +struct pid_ns_clone_args { + int *cpipe; + int sock; + pid_t tpid; + int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns +}; + static bool startswith(const char *line, const char *pref) { if (strncmp(line, pref, strlen(pref)) == 0) diff --git a/bindings.h b/bindings.h index e3c0c83..7f928d6 100644 --- a/bindings.h +++ b/bindings.h @@ -2,6 +2,7 @@ #define __LXCFS_BINDINGS_H #include "macro.h" +#include "cgroup_fuse.h" #include "sysfs_fuse.h" /* directory under which we mount the 
controllers - /run/lxcfs/controllers */ @@ -42,23 +43,6 @@ struct lxcfs_opts { bool swap_off; }; -extern int cg_write(const char *path, const char *buf, size_t size, off_t offset, - struct fuse_file_info *fi); -extern int cg_mkdir(const char *path, mode_t mode); -extern int cg_chown(const char *path, uid_t uid, gid_t gid); -extern int cg_rmdir(const char *path); -extern int cg_chmod(const char *path, mode_t mode); -extern int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, - struct fuse_file_info *fi); -extern int cg_releasedir(const char *path, struct fuse_file_info *fi); -extern int cg_release(const char *path, struct fuse_file_info *fi); -extern int cg_read(const char *path, char *buf, size_t size, off_t offset, - struct fuse_file_info *fi); -extern int cg_opendir(const char *path, struct fuse_file_info *fi); -extern int cg_getattr(const char *path, struct stat *sb); -extern int cg_open(const char *path, struct fuse_file_info *fi); -extern int cg_access(const char *path, int mode); - extern int proc_getattr(const char *path, struct stat *sb); extern int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi); diff --git a/cgroup_fuse.c b/cgroup_fuse.c new file mode 100644 index 0000000..e7833a2 --- /dev/null +++ b/cgroup_fuse.c @@ -0,0 +1,2302 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#define FUSE_USE_VERSION 26 + +#define __STDC_FORMAT_MACROS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bindings.h" +#include "config.h" +#include "cgroups/cgroup.h" +#include "cgroups/cgroup_utils.h" +#include "memory_utils.h" +#include "utils.h" + +struct cgfs_files { + char *name; + uint32_t uid, gid; + uint32_t mode; +}; + +struct pid_ns_clone_args { 
+ int *cpipe; + int sock; + pid_t tpid; + /* pid_from_ns or pid_to_ns. */ + int (*wrapped) (int, pid_t); +}; + +/* + * given /cgroup/freezer/a/b, return "freezer". + * the returned char* should NOT be freed. + */ +static char *pick_controller_from_path(struct fuse_context *fc, const char *path) +{ + const char *p1; + char *contr, *slash; + + if (strlen(path) < 9) { + errno = EACCES; + return NULL; + } + if (*(path + 7) != '/') { + errno = EINVAL; + return NULL; + } + p1 = path + 8; + contr = strdupa(p1); + if (!contr) { + errno = ENOMEM; + return NULL; + } + slash = strstr(contr, "/"); + if (slash) + *slash = '\0'; + + for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { + if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0) + return (*h)->__controllers; + } + errno = ENOENT; + return NULL; +} + +/* + * Find the start of cgroup in /cgroup/controller/the/cgroup/path + * Note that the returned value may include files (keynames) etc + */ +static const char *find_cgroup_in_path(const char *path) +{ + const char *p1; + + if (strlen(path) < 9) { + errno = EACCES; + return NULL; + } + p1 = strstr(path + 8, "/"); + if (!p1) { + errno = EINVAL; + return NULL; + } + errno = 0; + return p1 + 1; +} + +/* + * split the last path element from the path in @cg. + * @dir is newly allocated and should be freed, @last not +*/ +static void get_cgdir_and_path(const char *cg, char **dir, char **last) +{ + char *p; + + do { + *dir = strdup(cg); + } while (!*dir); + *last = strrchr(cg, '/'); + if (!*last) { + *last = NULL; + return; + } + p = strrchr(*dir, '/'); + *p = '\0'; +} + +static bool is_child_cgroup(const char *controller, const char *cgroup, const char *f) +{ + int cfd; + size_t len; + char *fnam; + int ret; + struct stat sb; + + cfd = get_cgroup_fd(controller); + if (cfd < 0) + return false; + + /* Make sure we pass a relative path to *at() family of functions. + * . 
+ /cgroup + / + f + \0 + */ + len = strlen(cgroup) + strlen(f) + 3; + fnam = alloca(len); + ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f); + if (ret < 0 || (size_t)ret >= len) + return false; + + ret = fstatat(cfd, fnam, &sb, 0); + if (ret < 0 || !S_ISDIR(sb.st_mode)) + return false; + + return true; +} + +/* + * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c. + */ +static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg) +{ + bool answer = false; + char *c2, *task_cg; + size_t target_len, task_len; + + if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0) + return true; + + c2 = get_pid_cgroup(pid, contrl); + if (!c2) + return false; + prune_init_slice(c2); + + task_cg = c2 + 1; + target_len = strlen(cg); + task_len = strlen(task_cg); + if (task_len == 0) { + /* Task is in the root cg, it can see everything. This case is + * not handled by the strmcps below, since they test for the + * last /, but that is the first / that we've chopped off + * above. 
+ */ + answer = true; + goto out; + } + if (strcmp(cg, task_cg) == 0) { + answer = true; + goto out; + } + if (target_len < task_len) { + /* looking up a parent dir */ + if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/') + answer = true; + goto out; + } + if (target_len > task_len) { + /* looking up a child dir */ + if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/') + answer = true; + goto out; + } + +out: + free(c2); + return answer; +} + +/* + * taskcg is a/b/c + * querycg is /a/b/c/d/e + * we return 'd' + */ +static char *get_next_cgroup_dir(const char *taskcg, const char *querycg) +{ + char *start, *end; + + if (strlen(taskcg) <= strlen(querycg)) { + lxcfs_error("%s\n", "I was fed bad input."); + return NULL; + } + + if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0)) + start = strdup(taskcg + 1); + else + start = strdup(taskcg + strlen(querycg) + 1); + if (!start) + return NULL; + end = strchr(start, '/'); + if (end) + *end = '\0'; + return start; +} + +/* + * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d. + * If pid is in /a, he may act on /a/b, but not on /b. + * if the answer is false and nextcg is not NULL, then *nextcg will point + * to a string containing the next cgroup directory under cg, which must be + * freed by the caller. + */ +static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg) +{ + bool answer = false; + char *c2 = get_pid_cgroup(pid, contrl); + char *linecmp; + + if (!c2) + return false; + prune_init_slice(c2); + + /* + * callers pass in '/' or './' (openat()) for root cgroup, otherwise + * they pass in a cgroup without leading '/' + * + * The original line here was: + * linecmp = *cg == '/' ? c2 : c2+1; + * TODO: I'm not sure why you'd want to increment when *cg != '/'? + * Serge, do you know? 
+ */ + if (*cg == '/' || !strncmp(cg, "./", 2)) + linecmp = c2; + else + linecmp = c2 + 1; + if (strncmp(linecmp, cg, strlen(linecmp)) != 0) { + if (nextcg) { + *nextcg = get_next_cgroup_dir(linecmp, cg); + } + goto out; + } + answer = true; + +out: + free(c2); + return answer; +} + +static struct cgfs_files *cgfs_get_key(const char *controller, + const char *cgroup, const char *file) +{ + int ret, cfd; + size_t len; + char *fnam; + struct stat sb; + struct cgfs_files *newkey; + + cfd = get_cgroup_fd(controller); + if (cfd < 0) + return false; + + if (file && *file == '/') + file++; + + if (file && strchr(file, '/')) + return NULL; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / + file + \0 + */ + len = strlen(cgroup) + 3; + if (file) + len += strlen(file) + 1; + fnam = alloca(len); + snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup, + file ? "/" : "", file ? file : ""); + + ret = fstatat(cfd, fnam, &sb, 0); + if (ret < 0) + return NULL; + + do { + newkey = malloc(sizeof(struct cgfs_files)); + } while (!newkey); + if (file) + newkey->name = must_copy_string(file); + else if (strrchr(cgroup, '/')) + newkey->name = must_copy_string(strrchr(cgroup, '/')); + else + newkey->name = must_copy_string(cgroup); + newkey->uid = sb.st_uid; + newkey->gid = sb.st_gid; + newkey->mode = sb.st_mode; + + return newkey; +} + +/* + * Given a open file * to /proc/pid/{u,g}id_map, and an id + * valid in the caller's namespace, return the id mapped into + * pid's namespace. + * Returns the mapped id, or -1 on error. 
+ */ +static unsigned int convert_id_to_ns(FILE *idfile, unsigned int in_id) +{ + unsigned int nsuid, // base id for a range in the idfile's namespace + hostuid, // base id for a range in the caller's namespace + count; // number of ids in this range + char line[400]; + int ret; + + fseek(idfile, 0L, SEEK_SET); + while (fgets(line, 400, idfile)) { + ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count); + if (ret != 3) + continue; + if (hostuid + count < hostuid || nsuid + count < nsuid) { + /* + * uids wrapped around - unexpected as this is a procfile, + * so just bail. + */ + lxcfs_error("pid wrapparound at entry %u %u %u in %s\n", + nsuid, hostuid, count, line); + return -1; + } + if (hostuid <= in_id && hostuid+count > in_id) { + /* + * now since hostuid <= in_id < hostuid+count, and + * hostuid+count and nsuid+count do not wrap around, + * we know that nsuid+(in_id-hostuid) which must be + * less that nsuid+(count) must not wrap around + */ + return (in_id - hostuid) + nsuid; + } + } + + // no answer found + return -1; +} + +/* + * for is_privileged_over, + * specify whether we require the calling uid to be root in his + * namespace + */ +#define NS_ROOT_REQD true +#define NS_ROOT_OPT false + +#define PROCLEN 100 + +static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root) +{ + char fpath[PROCLEN]; + int ret; + bool answer = false; + uid_t nsuid; + + if (victim == -1 || uid == -1) + return false; + + /* + * If the request is one not requiring root in the namespace, + * then having the same uid suffices. (i.e. 
uid 1000 has write + * access to files owned by uid 1000 + */ + if (!req_ns_root && uid == victim) + return true; + + ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid); + if (ret < 0 || ret >= PROCLEN) + return false; + FILE *f = fopen(fpath, "r"); + if (!f) + return false; + + /* if caller's not root in his namespace, reject */ + nsuid = convert_id_to_ns(f, uid); + if (nsuid) + goto out; + + /* + * If victim is not mapped into caller's ns, reject. + * XXX I'm not sure this check is needed given that fuse + * will be sending requests where the vfs has converted + */ + nsuid = convert_id_to_ns(f, victim); + if (nsuid == -1) + goto out; + + answer = true; + +out: + fclose(f); + return answer; +} + +static bool perms_include(int fmode, mode_t req_mode) +{ + mode_t r; + + switch (req_mode & O_ACCMODE) { + case O_RDONLY: + r = S_IROTH; + break; + case O_WRONLY: + r = S_IWOTH; + break; + case O_RDWR: + r = S_IROTH | S_IWOTH; + break; + default: + return false; + } + return ((fmode & r) == r); +} + +static void free_key(struct cgfs_files *k) +{ + if (!k) + return; + free_disarm(k->name); + free_disarm(k); +} + +/* + * check whether a fuse context may access a cgroup dir or file + * + * If file is not null, it is a cgroup file to check under cg. + * If file is null, then we are checking perms on cg itself. + * + * For files we can check the mode of the list_keys result. + * For cgroups, we must make assumptions based on the files under the + * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups + * yet. 
+ */ +static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode) +{ + struct cgfs_files *k = NULL; + bool ret = false; + + k = cgfs_get_key(contrl, cg, file); + if (!k) + return false; + + if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) { + if (perms_include(k->mode >> 6, mode)) { + ret = true; + goto out; + } + } + if (fc->gid == k->gid) { + if (perms_include(k->mode >> 3, mode)) { + ret = true; + goto out; + } + } + ret = perms_include(k->mode, mode); + +out: + free_key(k); + return ret; +} + +int cg_getattr(const char *path, struct stat *sb) +{ + struct timespec now; + struct fuse_context *fc = fuse_get_context(); + char * cgdir = NULL; + char *last = NULL, *path1, *path2; + struct cgfs_files *k = NULL; + const char *cgroup; + const char *controller = NULL; + int ret = -ENOENT; + + + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + + memset(sb, 0, sizeof(struct stat)); + + if (clock_gettime(CLOCK_REALTIME, &now) < 0) + return -EINVAL; + + sb->st_uid = sb->st_gid = 0; + sb->st_atim = sb->st_mtim = sb->st_ctim = now; + sb->st_size = 0; + + if (strcmp(path, "/cgroup") == 0) { + sb->st_mode = S_IFDIR | 00755; + sb->st_nlink = 2; + return 0; + } + + controller = pick_controller_from_path(fc, path); + if (!controller) + return -errno; + cgroup = find_cgroup_in_path(path); + if (!cgroup) { + /* this is just /cgroup/controller, return it as a dir */ + sb->st_mode = S_IFDIR | 00755; + sb->st_nlink = 2; + return 0; + } + + get_cgdir_and_path(cgroup, &cgdir, &last); + + if (!last) { + path1 = "/"; + path2 = cgdir; + } else { + path1 = cgdir; + path2 = last; + } + + pid_t initpid = lookup_initpid_in_store(fc->pid); + if (initpid <= 1 || is_shared_pidns(initpid)) + initpid = fc->pid; + /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys. 
+ * Then check that caller's cgroup is under path if last is a child + * cgroup, or cgdir if last is a file */ + + if (is_child_cgroup(controller, path1, path2)) { + if (!caller_may_see_dir(initpid, controller, cgroup)) { + ret = -ENOENT; + goto out; + } + if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) { + /* this is just /cgroup/controller, return it as a dir */ + sb->st_mode = S_IFDIR | 00555; + sb->st_nlink = 2; + ret = 0; + goto out; + } + if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) { + ret = -EACCES; + goto out; + } + + // get uid, gid, from '/tasks' file and make up a mode + // That is a hack, until cgmanager gains a GetCgroupPerms fn. + sb->st_mode = S_IFDIR | 00755; + k = cgfs_get_key(controller, cgroup, NULL); + if (!k) { + sb->st_uid = sb->st_gid = 0; + } else { + sb->st_uid = k->uid; + sb->st_gid = k->gid; + } + free_key(k); + sb->st_nlink = 2; + ret = 0; + goto out; + } + + if ((k = cgfs_get_key(controller, path1, path2)) != NULL) { + sb->st_mode = S_IFREG | k->mode; + sb->st_nlink = 1; + sb->st_uid = k->uid; + sb->st_gid = k->gid; + sb->st_size = 0; + free_key(k); + if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) { + ret = -ENOENT; + goto out; + } + ret = 0; + } + +out: + free(cgdir); + return ret; +} + +/* + * Chown all the files in the cgroup directory. We do this when we create a + * cgroup on behalf of a user. 
+ */ +static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd) +{ + struct dirent *direntp; + char path[MAXPATHLEN]; + size_t len; + DIR *d; + int fd1, ret; + + len = strlen(dirname); + if (len >= MAXPATHLEN) { + lxcfs_error("Pathname too long: %s\n", dirname); + return; + } + + fd1 = openat(fd, dirname, O_DIRECTORY); + if (fd1 < 0) + return; + + d = fdopendir(fd1); + if (!d) { + lxcfs_error("Failed to open %s\n", dirname); + return; + } + + while ((direntp = readdir(d))) { + if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, "..")) + continue; + ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); + if (ret < 0 || ret >= MAXPATHLEN) { + lxcfs_error("Pathname too long under %s\n", dirname); + continue; + } + if (fchownat(fd, path, uid, gid, 0) < 0) + lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid); + } + closedir(d); +} + +static int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid) +{ + int cfd; + size_t len; + char *dirnam; + + cfd = get_cgroup_fd(controller); + if (cfd < 0) + return -EINVAL; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cg + \0 + */ + len = strlen(cg) + 2; + dirnam = alloca(len); + snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg); + + if (mkdirat(cfd, dirnam, 0755) < 0) + return -errno; + + if (uid == 0 && gid == 0) + return 0; + + if (fchownat(cfd, dirnam, uid, gid, 0) < 0) + return -errno; + + chown_all_cgroup_files(dirnam, uid, gid, cfd); + + return 0; +} + +int cg_mkdir(const char *path, mode_t mode) +{ + struct fuse_context *fc = fuse_get_context(); + char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL; + const char *cgroup; + int ret; + + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + + controller = pick_controller_from_path(fc, path); + if (!controller) + return errno == ENOENT ? 
-EPERM : -errno; + + cgroup = find_cgroup_in_path(path); + if (!cgroup) + return -errno; + + get_cgdir_and_path(cgroup, &cgdir, &last); + if (!last) + path1 = "/"; + else + path1 = cgdir; + + pid_t initpid = lookup_initpid_in_store(fc->pid); + if (initpid <= 1 || is_shared_pidns(initpid)) + initpid = fc->pid; + if (!caller_is_in_ancestor(initpid, controller, path1, &next)) { + if (!next) + ret = -EINVAL; + else if (last && strcmp(next, last) == 0) + ret = -EEXIST; + else + ret = -EPERM; + goto out; + } + + if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) { + ret = -EACCES; + goto out; + } + if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) { + ret = -EACCES; + goto out; + } + + ret = cgfs_create(controller, cgroup, fc->uid, fc->gid); + +out: + free(cgdir); + free(next); + return ret; +} + +static bool recursive_rmdir(const char *dirname, int fd, const int cfd) +{ + struct dirent *direntp; + DIR *dir; + bool ret = false; + char pathname[MAXPATHLEN]; + int dupfd; + + dupfd = dup(fd); // fdopendir() does bad things once it uses an fd. 
+ if (dupfd < 0) + return false; + + dir = fdopendir(dupfd); + if (!dir) { + lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno)); + close(dupfd); + return false; + } + + while ((direntp = readdir(dir))) { + struct stat mystat; + int rc; + + if (!strcmp(direntp->d_name, ".") || + !strcmp(direntp->d_name, "..")) + continue; + + rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); + if (rc < 0 || rc >= MAXPATHLEN) { + lxcfs_error("%s\n", "Pathname too long."); + continue; + } + + rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW); + if (rc) { + lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno)); + continue; + } + if (S_ISDIR(mystat.st_mode)) + if (!recursive_rmdir(pathname, fd, cfd)) + lxcfs_debug("Error removing %s.\n", pathname); + } + + ret = true; + if (closedir(dir) < 0) { + lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno)); + ret = false; + } + + if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) { + lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno)); + ret = false; + } + + close(dupfd); + + return ret; +} + +static bool cgfs_remove(const char *controller, const char *cg) +{ + int fd, cfd; + size_t len; + char *dirnam; + bool bret; + + cfd = get_cgroup_fd(controller); + if (cfd < 0) + return false; + + /* Make sure we pass a relative path to *at() family of functions. + * . 
+ /cg + \0 + */ + len = strlen(cg) + 2; + dirnam = alloca(len); + snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg); + + fd = openat(cfd, dirnam, O_DIRECTORY); + if (fd < 0) + return false; + + bret = recursive_rmdir(dirnam, fd, cfd); + close(fd); + return bret; +} + +int cg_rmdir(const char *path) +{ + struct fuse_context *fc = fuse_get_context(); + char *last = NULL, *cgdir = NULL, *controller, *next = NULL; + const char *cgroup; + int ret; + + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + + controller = pick_controller_from_path(fc, path); + if (!controller) /* Someone's trying to delete "/cgroup". */ + return -EPERM; + + cgroup = find_cgroup_in_path(path); + if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */ + return -EPERM; + + get_cgdir_and_path(cgroup, &cgdir, &last); + if (!last) { + /* Someone's trying to delete a cgroup on the same level as the + * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or + * rmdir "/cgroup/blkio/init.slice". + */ + ret = -EPERM; + goto out; + } + + pid_t initpid = lookup_initpid_in_store(fc->pid); + if (initpid <= 1 || is_shared_pidns(initpid)) + initpid = fc->pid; + if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) { + if (!last || (next && (strcmp(next, last) == 0))) + ret = -EBUSY; + else + ret = -ENOENT; + goto out; + } + + if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) { + ret = -EACCES; + goto out; + } + if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) { + ret = -EACCES; + goto out; + } + + if (!cgfs_remove(controller, cgroup)) { + ret = -EINVAL; + goto out; + } + + ret = 0; + +out: + free(cgdir); + free(next); + return ret; +} + +static bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode) +{ + int cfd; + size_t len; + char *pathname; + + cfd = get_cgroup_fd(controller); + if (cfd < 0) + return false; + + /* Make sure we pass a relative path to *at() family of functions. + * . 
+ /file + \0 + */ + len = strlen(file) + 2; + pathname = alloca(len); + snprintf(pathname, len, "%s%s", dot_or_empty(file), file); + if (fchmodat(cfd, pathname, mode, 0) < 0) + return false; + return true; +} + +int cg_chmod(const char *path, mode_t mode) +{ + struct fuse_context *fc = fuse_get_context(); + char * cgdir = NULL, *last = NULL, *path1, *path2, *controller; + struct cgfs_files *k = NULL; + const char *cgroup; + int ret; + + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + + if (strcmp(path, "/cgroup") == 0) + return -EPERM; + + controller = pick_controller_from_path(fc, path); + if (!controller) + return errno == ENOENT ? -EPERM : -errno; + + cgroup = find_cgroup_in_path(path); + if (!cgroup) + /* this is just /cgroup/controller */ + return -EPERM; + + get_cgdir_and_path(cgroup, &cgdir, &last); + + if (!last) { + path1 = "/"; + path2 = cgdir; + } else { + path1 = cgdir; + path2 = last; + } + + if (is_child_cgroup(controller, path1, path2)) { + // get uid, gid, from '/tasks' file and make up a mode + // That is a hack, until cgmanager gains a GetCgroupPerms fn. + k = cgfs_get_key(controller, cgroup, "tasks"); + + } else + k = cgfs_get_key(controller, path1, path2); + + if (!k) { + ret = -EINVAL; + goto out; + } + + /* + * This being a fuse request, the uid and gid must be valid + * in the caller's namespace. So we can just check to make + * sure that the caller is root in his uid, and privileged + * over the file's current owner. 
+ */ + if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) { + ret = -EPERM; + goto out; + } + + if (!cgfs_chmod_file(controller, cgroup, mode)) { + ret = -EINVAL; + goto out; + } + + ret = 0; +out: + free_key(k); + free(cgdir); + return ret; +} + +static int is_dir(const char *path, int fd) +{ + struct stat statbuf; + int ret = fstatat(fd, path, &statbuf, fd); + if (ret == 0 && S_ISDIR(statbuf.st_mode)) + return 1; + return 0; +} + +static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd) +{ + size_t len; + char *fname; + + len = strlen(dirname) + strlen("/cgroup.procs") + 1; + fname = alloca(len); + snprintf(fname, len, "%s/tasks", dirname); + if (fchownat(fd, fname, uid, gid, 0) != 0) + return -errno; + snprintf(fname, len, "%s/cgroup.procs", dirname); + if (fchownat(fd, fname, uid, gid, 0) != 0) + return -errno; + return 0; +} + +static int cgfs_chown_file(const char *controller, const char *file, uid_t uid, + gid_t gid) +{ + int cfd; + size_t len; + char *pathname; + + cfd = get_cgroup_fd(controller); + if (cfd < 0) + return false; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /file + \0 + */ + len = strlen(file) + 2; + pathname = alloca(len); + snprintf(pathname, len, "%s%s", dot_or_empty(file), file); + if (fchownat(cfd, pathname, uid, gid, 0) < 0) + return -errno; + + if (is_dir(pathname, cfd)) + return chown_tasks_files(pathname, uid, gid, cfd); + + return 0; +} + +int cg_chown(const char *path, uid_t uid, gid_t gid) +{ + struct fuse_context *fc = fuse_get_context(); + char *cgdir = NULL, *last = NULL, *path1, *path2, *controller; + struct cgfs_files *k = NULL; + const char *cgroup; + int ret; + + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + + if (strcmp(path, "/cgroup") == 0) + return -EPERM; + + controller = pick_controller_from_path(fc, path); + if (!controller) + return errno == ENOENT ? 
-EPERM : -errno; + + cgroup = find_cgroup_in_path(path); + if (!cgroup) + /* this is just /cgroup/controller */ + return -EPERM; + + get_cgdir_and_path(cgroup, &cgdir, &last); + + if (!last) { + path1 = "/"; + path2 = cgdir; + } else { + path1 = cgdir; + path2 = last; + } + + if (is_child_cgroup(controller, path1, path2)) { + // get uid, gid, from '/tasks' file and make up a mode + // That is a hack, until cgmanager gains a GetCgroupPerms fn. + k = cgfs_get_key(controller, cgroup, "tasks"); + + } else + k = cgfs_get_key(controller, path1, path2); + + if (!k) { + ret = -EINVAL; + goto out; + } + + /* + * This being a fuse request, the uid and gid must be valid + * in the caller's namespace. So we can just check to make + * sure that the caller is root in his uid, and privileged + * over the file's current owner. + */ + if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) { + ret = -EACCES; + goto out; + } + + ret = cgfs_chown_file(controller, cgroup, uid, gid); + +out: + free_key(k); + free(cgdir); + + return ret; +} + +int cg_open(const char *path, struct fuse_file_info *fi) +{ + const char *cgroup; + char *last = NULL, *path1, *path2, * cgdir = NULL, *controller; + struct cgfs_files *k = NULL; + struct file_info *file_info; + struct fuse_context *fc = fuse_get_context(); + int ret; + + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + + controller = pick_controller_from_path(fc, path); + if (!controller) + return -errno; + cgroup = find_cgroup_in_path(path); + if (!cgroup) + return -errno; + + get_cgdir_and_path(cgroup, &cgdir, &last); + if (!last) { + path1 = "/"; + path2 = cgdir; + } else { + path1 = cgdir; + path2 = last; + } + + k = cgfs_get_key(controller, path1, path2); + if (!k) { + ret = -EINVAL; + goto out; + } + free_key(k); + + pid_t initpid = lookup_initpid_in_store(fc->pid); + if (initpid <= 1 || is_shared_pidns(initpid)) + initpid = fc->pid; + if (!caller_may_see_dir(initpid, controller, path1)) { + ret = 
-ENOENT; + goto out; + } + if (!fc_may_access(fc, controller, path1, path2, fi->flags)) { + ret = -EACCES; + goto out; + } + + /* we'll free this at cg_release */ + file_info = malloc(sizeof(*file_info)); + if (!file_info) { + ret = -ENOMEM; + goto out; + } + file_info->controller = must_copy_string(controller); + file_info->cgroup = must_copy_string(path1); + file_info->file = must_copy_string(path2); + file_info->type = LXC_TYPE_CGFILE; + file_info->buf = NULL; + file_info->buflen = 0; + + fi->fh = (unsigned long)file_info; + ret = 0; + +out: + free(cgdir); + return ret; +} + +#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP ) + +static bool wait_for_sock(int sock, int timeout) +{ + struct epoll_event ev; + int epfd, ret, now, starttime, deltatime, saved_errno; + + if ((starttime = time(NULL)) < 0) + return false; + + if ((epfd = epoll_create(1)) < 0) { + lxcfs_error("%s\n", "Failed to create epoll socket: %m."); + return false; + } + + ev.events = POLLIN_SET; + ev.data.fd = sock; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) { + lxcfs_error("%s\n", "Failed adding socket to epoll: %m."); + close(epfd); + return false; + } + +again: + if ((now = time(NULL)) < 0) { + close(epfd); + return false; + } + + deltatime = (starttime + timeout) - now; + if (deltatime < 0) { // timeout + errno = 0; + close(epfd); + return false; + } + ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1); + if (ret < 0 && errno == EINTR) + goto again; + saved_errno = errno; + close(epfd); + + if (ret <= 0) { + errno = saved_errno; + return false; + } + return true; +} + +static int msgrecv(int sockfd, void *buf, size_t len) +{ + if (!wait_for_sock(sockfd, 2)) + return -1; + return recv(sockfd, buf, len, MSG_DONTWAIT); +} + +#define SEND_CREDS_OK 0 +#define SEND_CREDS_NOTSK 1 +#define SEND_CREDS_FAIL 2 + +static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst) +{ + struct msghdr msg = { 0 }; + struct iovec iov; + struct cmsghdr *cmsg; + char 
cmsgbuf[CMSG_SPACE(sizeof(*cred))]; + char buf[1]; + buf[0] = 'p'; + + if (pingfirst) { + if (msgrecv(sock, buf, 1) != 1) { + lxcfs_error("%s\n", "Error getting reply from server over socketpair."); + return SEND_CREDS_FAIL; + } + } + + msg.msg_control = cmsgbuf; + msg.msg_controllen = sizeof(cmsgbuf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_CREDENTIALS; + memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred)); + + msg.msg_name = NULL; + msg.msg_namelen = 0; + + buf[0] = v; + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + if (sendmsg(sock, &msg, 0) < 0) { + lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno)); + if (errno == 3) + return SEND_CREDS_NOTSK; + return SEND_CREDS_FAIL; + } + + return SEND_CREDS_OK; +} + +static int wait_for_pid(pid_t pid) +{ + int status, ret; + + if (pid <= 0) + return -1; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + return -1; + } + if (ret != pid) + goto again; + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) + return -1; + return 0; +} + +static bool recv_creds(int sock, struct ucred *cred, char *v) +{ + struct msghdr msg = { 0 }; + struct iovec iov; + struct cmsghdr *cmsg; + char cmsgbuf[CMSG_SPACE(sizeof(*cred))]; + char buf[1]; + int ret; + int optval = 1; + + *v = '1'; + + cred->pid = -1; + cred->uid = -1; + cred->gid = -1; + + if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) { + lxcfs_error("Failed to set passcred: %s\n", strerror(errno)); + return false; + } + buf[0] = '1'; + if (write(sock, buf, 1) != 1) { + lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno)); + return false; + } + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = cmsgbuf; + msg.msg_controllen = sizeof(cmsgbuf); + + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + msg.msg_iov = &iov; + 
msg.msg_iovlen = 1; + + if (!wait_for_sock(sock, 2)) { + lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno)); + return false; + } + ret = recvmsg(sock, &msg, MSG_DONTWAIT); + if (ret < 0) { + lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno)); + return false; + } + + cmsg = CMSG_FIRSTHDR(&msg); + + if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) && + cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS) { + memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred)); + } + *v = buf[0]; + + return true; +} + +/* + * pid_to_ns - reads pids from a ucred over a socket, then writes the + * int value back over the socket. This shifts the pid from the + * sender's pidns into tpid's pidns. + */ +static int pid_to_ns(int sock, pid_t tpid) +{ + char v = '0'; + struct ucred cred; + + while (recv_creds(sock, &cred, &v)) { + if (v == '1') + return 0; + + if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t)) + return 1; + } + + return 0; +} + +/* + * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage + * with clone(). This simply writes '1' as ACK back to the parent + * before calling the actual wrapped function. + */ +static int pid_ns_clone_wrapper(void *arg) { + struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg; + char b = '1'; + + close(args->cpipe[0]); + if (write(args->cpipe[1], &b, sizeof(char)) < 0) + lxcfs_error("(child): error on write: %s.\n", strerror(errno)); + close(args->cpipe[1]); + return args->wrapped(args->sock, args->tpid); +} + +/* + * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain + * in your old pidns. Only children which you clone will be in the target + * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to + * actually convert pids. 
+ * + * Note: glibc's fork() does not respect pidns, which can lead to failed + * assertions inside glibc (and thus failed forks) if the child's pid in + * the pidns and the parent pid outside are identical. Using clone prevents + * this issue. + */ +static void pid_to_ns_wrapper(int sock, pid_t tpid) +{ + int newnsfd = -1, ret, cpipe[2]; + char fnam[100]; + pid_t cpid; + char v; + + ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid); + if (ret < 0 || ret >= sizeof(fnam)) + _exit(1); + newnsfd = open(fnam, O_RDONLY); + if (newnsfd < 0) + _exit(1); + if (setns(newnsfd, 0) < 0) + _exit(1); + close(newnsfd); + + if (pipe(cpipe) < 0) + _exit(1); + + struct pid_ns_clone_args args = { + .cpipe = cpipe, + .sock = sock, + .tpid = tpid, + .wrapped = &pid_to_ns + }; + size_t stack_size = sysconf(_SC_PAGESIZE); + void *stack = alloca(stack_size); + + cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args); + if (cpid < 0) + _exit(1); + + /* Give the child 1 second to be done forking and write its ack. */ + if (!wait_for_sock(cpipe[0], 1)) + _exit(1); + ret = read(cpipe[0], &v, 1); + if (ret != sizeof(char) || v != '1') + _exit(1); + + if (!wait_for_pid(cpid)) + _exit(1); + _exit(0); +} + +/* + * append pid to *src. + * src: a pointer to a char* in which ot append the pid. + * sz: the number of characters printed so far, minus trailing \0. + * asz: the allocated size so far + * pid: the pid to append + */ +static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid) +{ + must_strcat(src, sz, asz, "%d\n", (int)pid); +} + +/* + * To read cgroup files with a particular pid, we will setns into the child + * pidns, open a pipe, fork a child - which will be the first to really be in + * the child ns - which does the cgfs_get_value and writes the data to the pipe. 
+ */ +static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, + const char *file, char **d) +{ + int sock[2] = {-1, -1}; + char *tmpdata = NULL; + int ret; + pid_t qpid, cpid = -1; + bool answer = false; + char v = '0'; + struct ucred cred; + size_t sz = 0, asz = 0; + + if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata)) + return false; + + /* + * Now we read the pids from returned data one by one, pass + * them into a child in the target namespace, read back the + * translated pids, and put them into our to-return data + */ + + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) { + perror("socketpair"); + free(tmpdata); + return false; + } + + cpid = fork(); + if (cpid == -1) + goto out; + + if (!cpid) // child - exits when done + pid_to_ns_wrapper(sock[1], tpid); + + char *ptr = tmpdata; + cred.uid = 0; + cred.gid = 0; + while (sscanf(ptr, "%d\n", &qpid) == 1) { + cred.pid = qpid; + ret = send_creds(sock[0], &cred, v, true); + + if (ret == SEND_CREDS_NOTSK) + goto next; + if (ret == SEND_CREDS_FAIL) + goto out; + + // read converted results + if (!wait_for_sock(sock[0], 2)) { + lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno)); + goto out; + } + if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { + lxcfs_error("Error reading pid from child: %s.\n", strerror(errno)); + goto out; + } + must_strcat_pid(d, &sz, &asz, qpid); +next: + ptr = strchr(ptr, '\n'); + if (!ptr) + break; + ptr++; + } + + cred.pid = getpid(); + v = '1'; + if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) { + // failed to ask child to exit + lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno)); + goto out; + } + + answer = true; + +out: + free(tmpdata); + if (cpid != -1) + wait_for_pid(cpid); + if (sock[0] != -1) { + close(sock[0]); + close(sock[1]); + } + return answer; +} + +int cg_read(const char *path, char *buf, size_t size, off_t offset, + struct fuse_file_info *fi) +{ + struct fuse_context *fc = 
fuse_get_context(); + struct file_info *f = (struct file_info *)fi->fh; + struct cgfs_files *k = NULL; + char *data = NULL; + int ret, s; + bool r; + + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + + if (f->type != LXC_TYPE_CGFILE) { + lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read."); + return -EIO; + } + + if (offset) + return 0; + + if (!f->controller) + return -EINVAL; + + if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) { + return -EINVAL; + } + free_key(k); + + + if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { + ret = -EACCES; + goto out; + } + + if (strcmp(f->file, "tasks") == 0 || + strcmp(f->file, "/tasks") == 0 || + strcmp(f->file, "/cgroup.procs") == 0 || + strcmp(f->file, "cgroup.procs") == 0) + // special case - we have to translate the pids + r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data); + else + r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data); + + if (!r) { + ret = -EINVAL; + goto out; + } + + if (!data) { + ret = 0; + goto out; + } + s = strlen(data); + if (s > size) + s = size; + memcpy(buf, data, s); + if (s > 0 && s < size && data[s-1] != '\n') + buf[s++] = '\n'; + + ret = s; + +out: + free(data); + return ret; +} + +int cg_opendir(const char *path, struct fuse_file_info *fi) +{ + struct fuse_context *fc = fuse_get_context(); + const char *cgroup; + struct file_info *dir_info; + char *controller = NULL; + + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + + if (strcmp(path, "/cgroup") == 0) { + cgroup = NULL; + controller = NULL; + } else { + // return list of keys for the controller, and list of child cgroups + controller = pick_controller_from_path(fc, path); + if (!controller) + return -errno; + + cgroup = find_cgroup_in_path(path); + if (!cgroup) { + /* this is just /cgroup/controller, return its contents */ + cgroup = "/"; + } + } + + pid_t initpid = 
lookup_initpid_in_store(fc->pid); + if (initpid <= 1 || is_shared_pidns(initpid)) + initpid = fc->pid; + if (cgroup) { + if (!caller_may_see_dir(initpid, controller, cgroup)) + return -ENOENT; + if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) + return -EACCES; + } + + /* we'll free this at cg_releasedir */ + dir_info = malloc(sizeof(*dir_info)); + if (!dir_info) + return -ENOMEM; + dir_info->controller = must_copy_string(controller); + dir_info->cgroup = must_copy_string(cgroup); + dir_info->type = LXC_TYPE_CGDIR; + dir_info->buf = NULL; + dir_info->file = NULL; + dir_info->buflen = 0; + + fi->fh = (unsigned long)dir_info; + return 0; +} + +int cg_release(const char *path, struct fuse_file_info *fi) +{ + do_release_file_info(fi); + return 0; +} + +int cg_releasedir(const char *path, struct fuse_file_info *fi) +{ + do_release_file_info(fi); + return 0; +} + +static FILE *open_pids_file(const char *controller, const char *cgroup) +{ + int fd, cfd; + size_t len; + char *pathname; + + cfd = get_cgroup_fd(controller); + if (cfd < 0) + return false; + + /* Make sure we pass a relative path to *at() family of functions. + * . 
+ /cgroup + / "cgroup.procs" + \0 + */ + len = strlen(cgroup) + strlen("cgroup.procs") + 3; + pathname = alloca(len); + snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup); + + fd = openat(cfd, pathname, O_WRONLY); + if (fd < 0) + return NULL; + + return fdopen(fd, "w"); +} + +static int pid_from_ns(int sock, pid_t tpid) +{ + pid_t vpid; + struct ucred cred; + char v; + int ret; + + cred.uid = 0; + cred.gid = 0; + while (1) { + if (!wait_for_sock(sock, 2)) { + lxcfs_error("%s\n", "Timeout reading from parent."); + return 1; + } + if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) { + lxcfs_error("Bad read from parent: %s.\n", strerror(errno)); + return 1; + } + if (vpid == -1) // done + break; + v = '0'; + cred.pid = vpid; + if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) { + v = '1'; + cred.pid = getpid(); + if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK) + return 1; + } + } + return 0; +} + +static void pid_from_ns_wrapper(int sock, pid_t tpid) +{ + int newnsfd = -1, ret, cpipe[2]; + char fnam[100]; + pid_t cpid; + char v; + + ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid); + if (ret < 0 || ret >= sizeof(fnam)) + _exit(1); + newnsfd = open(fnam, O_RDONLY); + if (newnsfd < 0) + _exit(1); + if (setns(newnsfd, 0) < 0) + _exit(1); + close(newnsfd); + + if (pipe(cpipe) < 0) + _exit(1); + + struct pid_ns_clone_args args = { + .cpipe = cpipe, + .sock = sock, + .tpid = tpid, + .wrapped = &pid_from_ns + }; + size_t stack_size = sysconf(_SC_PAGESIZE); + void *stack = alloca(stack_size); + + cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args); + if (cpid < 0) + _exit(1); + + // give the child 1 second to be done forking and + // write its ack + if (!wait_for_sock(cpipe[0], 1)) + _exit(1); + ret = read(cpipe[0], &v, 1); + if (ret != sizeof(char) || v != '1') + _exit(1); + + if (!wait_for_pid(cpid)) + _exit(1); + _exit(0); +} + +/* + * get_pid_creds: get the real uid and gid of @pid from + * 
/proc/$$/status + * (XXX should we use euid here?) + */ +static void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid) +{ + char line[400]; + uid_t u; + gid_t g; + FILE *f; + + *uid = -1; + *gid = -1; + sprintf(line, "/proc/%d/status", pid); + if ((f = fopen(line, "r")) == NULL) { + lxcfs_error("Error opening %s: %s\n", line, strerror(errno)); + return; + } + while (fgets(line, 400, f)) { + if (strncmp(line, "Uid:", 4) == 0) { + if (sscanf(line+4, "%u", &u) != 1) { + lxcfs_error("bad uid line for pid %u\n", pid); + fclose(f); + return; + } + *uid = u; + } else if (strncmp(line, "Gid:", 4) == 0) { + if (sscanf(line+4, "%u", &g) != 1) { + lxcfs_error("bad gid line for pid %u\n", pid); + fclose(f); + return; + } + *gid = g; + } + } + fclose(f); +} + +/* + * Given host @uid, store in @answer the uid to which it maps in + * @pid's user namespace. Returns false if @pid's uid_map cannot be + * opened or if @uid has no mapping there (the lookup yields -1). + */ +static bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer) +{ + FILE *f; + char line[400]; + + sprintf(line, "/proc/%d/uid_map", pid); + if ((f = fopen(line, "r")) == NULL) { + return false; + } + + *answer = convert_id_to_ns(f, uid); + fclose(f); + + if (*answer == -1) + return false; + return true; +} + +/* + * May the requestor @r move victim @v to a new cgroup? + * This is allowed if + * . they are the same task + * . they are owned by the same uid + * . @r is root on the host, or + * . @v's uid is mapped into @r's user namespace, where @r is root.
+ */ +static bool may_move_pid(pid_t r, uid_t r_uid, pid_t v) +{ + uid_t v_uid, tmpuid; + gid_t v_gid; + + if (r == v) + return true; + if (r_uid == 0) + return true; + get_pid_creds(v, &v_uid, &v_gid); + if (r_uid == v_uid) + return true; + if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0 + && hostuid_to_ns(v_uid, r, &tmpuid)) + return true; + return false; +} + +static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, + const char *cg, const char *file, const char *buf) +{ + int sock[2] = {-1, -1}; + pid_t qpid, cpid = -1; + FILE *pids_file = NULL; + bool answer = false, fail = false; + + pids_file = open_pids_file(contrl, cg); + if (!pids_file) + return false; + + /* + * write the pids to a socket, have helper in writer's pidns + * call movepid for us + */ + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) { + perror("socketpair"); + goto out; + } + + cpid = fork(); + if (cpid == -1) + goto out; + + if (!cpid) { // child + fclose(pids_file); + pid_from_ns_wrapper(sock[1], tpid); + } + + const char *ptr = buf; + while (sscanf(ptr, "%d", &qpid) == 1) { + struct ucred cred; + char v; + + if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { + lxcfs_error("Error writing pid to child: %s.\n", strerror(errno)); + goto out; + } + + if (recv_creds(sock[0], &cred, &v)) { + if (v == '0') { + if (!may_move_pid(tpid, tuid, cred.pid)) { + fail = true; + break; + } + if (fprintf(pids_file, "%d", (int) cred.pid) < 0) + fail = true; + } + } + + ptr = strchr(ptr, '\n'); + if (!ptr) + break; + ptr++; + } + + /* Done (or giving up): send -1 to ask the child to exit */ + qpid = -1; + if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid)) + lxcfs_error("%s\n", "Warning: failed to ask child to exit."); + + if (!fail) + answer = true; + +out: + if (cpid != -1) + wait_for_pid(cpid); + if (sock[0] != -1) { + close(sock[0]); + close(sock[1]); + } + if (pids_file) { + if (fclose(pids_file) != 0) + answer = false; + } + return answer; +} + +static bool write_string(const char *fnam, const char
*string, int fd) +{ + FILE *f; + size_t len, ret; + + f = fdopen(fd, "w"); + if (!f) + return false; + + len = strlen(string); + ret = fwrite(string, 1, len, f); + if (ret != len) { + lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n", + strerror(errno), string, fnam); + fclose(f); + return false; + } + + if (fclose(f) < 0) { + lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam); + return false; + } + + return true; +} + +static bool cgfs_set_value(const char *controller, const char *cgroup, + const char *file, const char *value) +{ + int ret, fd, cfd; + size_t len; + char *fnam; + + cfd = get_cgroup_fd(controller); + if (cfd < 0) + return false; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / + file + \0 + */ + len = strlen(cgroup) + strlen(file) + 3; + fnam = alloca(len); + ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file); + if (ret < 0 || (size_t)ret >= len) + return false; + + fd = openat(cfd, fnam, O_WRONLY); + if (fd < 0) + return false; + + return write_string(fnam, value, fd); +} + +int cg_write(const char *path, const char *buf, size_t size, off_t offset, + struct fuse_file_info *fi) +{ + struct fuse_context *fc = fuse_get_context(); + char *localbuf = NULL; + struct cgfs_files *k = NULL; + struct file_info *f = (struct file_info *)fi->fh; + bool r; + + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + + if (f->type != LXC_TYPE_CGFILE) { + lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write."); + return -EIO; + } + + if (offset) + return 0; + + localbuf = alloca(size+1); + localbuf[size] = '\0'; + memcpy(localbuf, buf, size); + + if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) { + size = -EINVAL; + goto out; + } + + if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) { + size = -EACCES; + goto out; + } + + if (strcmp(f->file, "tasks") == 0 || + strcmp(f->file, "/tasks") == 0 || + 
strcmp(f->file, "/cgroup.procs") == 0 || + strcmp(f->file, "cgroup.procs") == 0) + // special case - we have to translate the pids + r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf); + else + r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf); + + if (!r) + size = -EINVAL; + +out: + free_key(k); + return size; +} + +static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, + bool directories, void ***list, size_t typesize, + void *(*iterator)(const char *, const char *, const char *)) +{ + int cfd, fd, ret; + size_t len; + char *cg; + char pathname[MAXPATHLEN]; + size_t sz = 0, asz = 0; + struct dirent *dirent; + DIR *dir; + + cfd = get_cgroup_fd(controller); + *list = NULL; + if (cfd < 0) + return false; + + /* Make sure we pass a relative path to *at() family of functions. */ + len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */; + cg = alloca(len); + ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup); + if (ret < 0 || (size_t)ret >= len) { + lxcfs_error("Pathname too long under %s\n", cgroup); + return false; + } + + fd = openat(cfd, cg, O_DIRECTORY); + if (fd < 0) + return false; + + dir = fdopendir(fd); + if (!dir) + return false; + + while ((dirent = readdir(dir))) { + struct stat mystat; + + if (!strcmp(dirent->d_name, ".") || + !strcmp(dirent->d_name, "..")) + continue; + + ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name); + if (ret < 0 || ret >= MAXPATHLEN) { + lxcfs_error("Pathname too long under %s\n", cg); + continue; + } + + ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW); + if (ret) { + lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno)); + continue; + } + if ((!directories && !S_ISREG(mystat.st_mode)) || + (directories && !S_ISDIR(mystat.st_mode))) + continue; + + if (sz+2 >= asz) { + void **tmp; + asz += BATCH_SIZE; + do { + tmp = realloc(*list, asz * typesize); + } while (!tmp); + *list = tmp; + } + (*list)[sz] = (*iterator)(controller, cg, 
dirent->d_name); + (*list)[sz+1] = NULL; + sz++; + } + if (closedir(dir) < 0) { + lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno)); + return false; + } + return true; +} + +static void *make_key_list_entry(const char *controller, const char *cgroup, + const char *dir_entry) +{ + struct cgfs_files *entry; + + entry = cgfs_get_key(controller, cgroup, dir_entry); + if (!entry) + lxcfs_error("Failed to retrieve files under %s:%s\n", + controller, cgroup); + return entry; +} + +static bool cgfs_list_keys(const char *controller, const char *cgroup, + struct cgfs_files ***keys) +{ + return cgfs_iterate_cgroup(controller, cgroup, false, (void ***)keys, + sizeof(*keys), &make_key_list_entry); +} + +static void *make_children_list_entry(const char *controller, + const char *cgroup, const char *dir_entry) +{ + return strdup(dir_entry); +} + +static bool cgfs_list_children(const char *controller, const char *cgroup, + char ***list) +{ + return cgfs_iterate_cgroup(controller, cgroup, true, (void ***)list, + sizeof(*list), &make_children_list_entry); +} + +static void free_keys(struct cgfs_files **keys) +{ + if (!keys) + return; + + for (int i = 0; keys[i]; i++) + free_key(keys[i]); + + free_disarm(keys); +} + +int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, + off_t offset, struct fuse_file_info *fi) +{ + struct file_info *d = (struct file_info *)fi->fh; + struct cgfs_files **list = NULL; + int i, ret; + char *nextcg = NULL; + struct fuse_context *fc = fuse_get_context(); + char **clist = NULL; + + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + + if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0) + return -EIO; + + if (d->type != LXC_TYPE_CGDIR) { + lxcfs_error("%s\n", "Internal error: file cache info used in readdir."); + return -EIO; + } + if (!d->cgroup && !d->controller) { + /* + * ls /var/lib/lxcfs/cgroup - just show list of controllers. + * This only works with the legacy hierarchy. 
+ */ + for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { + if (is_unified_hierarchy(*h)) + continue; + + if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0)) + return -EIO; + } + + return 0; + } + + if (!cgfs_list_keys(d->controller, d->cgroup, &list)) { + // not a valid cgroup + ret = -EINVAL; + goto out; + } + + pid_t initpid = lookup_initpid_in_store(fc->pid); + if (initpid <= 1 || is_shared_pidns(initpid)) + initpid = fc->pid; + if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) { + if (nextcg) { + ret = filler(buf, nextcg, NULL, 0); + free(nextcg); + if (ret != 0) { + ret = -EIO; + goto out; + } + } + ret = 0; + goto out; + } + + for (i = 0; list && list[i]; i++) { + if (filler(buf, list[i]->name, NULL, 0) != 0) { + ret = -EIO; + goto out; + } + } + + // now get the list of child cgroups + + if (!cgfs_list_children(d->controller, d->cgroup, &clist)) { + ret = 0; + goto out; + } + if (clist) { + for (i = 0; clist[i]; i++) { + if (filler(buf, clist[i], NULL, 0) != 0) { + ret = -EIO; + goto out; + } + } + } + ret = 0; + +out: + free_keys(list); + if (clist) { + for (i = 0; clist[i]; i++) + free(clist[i]); + free(clist); + } + return ret; +} + +int cg_access(const char *path, int mode) +{ + int ret; + const char *cgroup; + char *path1, *path2, *controller; + char *last = NULL, *cgdir = NULL; + struct cgfs_files *k = NULL; + struct fuse_context *fc = fuse_get_context(); + + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + + if (strcmp(path, "/cgroup") == 0) + return 0; + + controller = pick_controller_from_path(fc, path); + if (!controller) + return -errno; + cgroup = find_cgroup_in_path(path); + if (!cgroup) { + // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not + if ((mode & W_OK) == 0) + return 0; + return -EACCES; + } + + get_cgdir_and_path(cgroup, &cgdir, &last); + if (!last) { + path1 = "/"; + path2 = cgdir; + } else { + path1 = cgdir; + path2 = last; + } + + k = 
cgfs_get_key(controller, path1, path2); + if (!k) { + if ((mode & W_OK) == 0) + ret = 0; + else + ret = -EACCES; + goto out; + } + free_key(k); + + pid_t initpid = lookup_initpid_in_store(fc->pid); + if (initpid <= 1 || is_shared_pidns(initpid)) + initpid = fc->pid; + if (!caller_may_see_dir(initpid, controller, path1)) { + ret = -ENOENT; + goto out; + } + if (!fc_may_access(fc, controller, path1, path2, mode)) { + ret = -EACCES; + goto out; + } + + ret = 0; + +out: + free(cgdir); + return ret; +} diff --git a/cgroup_fuse.h b/cgroup_fuse.h new file mode 100644 index 0000000..4515530 --- /dev/null +++ b/cgroup_fuse.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#ifndef __LXCFS_CGROUP_FUSE_H +#define __LXCFS_CGROUP_FUSE_H + +extern int cg_getattr(const char *path, struct stat *sb); +extern int cg_mkdir(const char *path, mode_t mode); +extern int cg_rmdir(const char *path); +extern int cg_chmod(const char *path, mode_t mode); +extern int cg_chown(const char *path, uid_t uid, gid_t gid); +extern int cg_open(const char *path, struct fuse_file_info *fi); +extern int cg_read(const char *path, char *buf, size_t size, off_t offset, + struct fuse_file_info *fi); +extern int cg_opendir(const char *path, struct fuse_file_info *fi); +extern int cg_release(const char *path, struct fuse_file_info *fi); +extern int cg_releasedir(const char *path, struct fuse_file_info *fi); +extern int cg_write(const char *path, const char *buf, size_t size, + off_t offset, struct fuse_file_info *fi); +extern int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, + off_t offset, struct fuse_file_info *fi); +extern int cg_access(const char *path, int mode); + +#endif /* __LXCFS_CGROUP_FUSE_H */ diff --git a/utils.c b/utils.c index 1ebcf16..5bfd442 100644 --- a/utils.c +++ b/utils.c @@ -127,3 +127,19 @@ int preserve_ns(const int pid, const char *ns) return open(path, O_RDONLY | O_CLOEXEC); } + +void do_release_file_info(struct fuse_file_info *fi) +{ + struct 
file_info *f = (struct file_info *)fi->fh; + + if (!f) + return; + + fi->fh = 0; + + free_disarm(f->controller); + free_disarm(f->cgroup); + free_disarm(f->file); + free_disarm(f->buf); + free_disarm(f); +} diff --git a/utils.h b/utils.h index fbe775e..0a4dd3c 100644 --- a/utils.h +++ b/utils.h @@ -3,11 +3,16 @@ #ifndef __LXCFS_UTILS_H #define __LXCFS_UTILS_H +#define FUSE_USE_VERSION 26 + +#include + /* Reserve buffer size to account for file size changes. */ #define BUF_RESERVE_SIZE 512 extern void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...); extern bool is_shared_pidns(pid_t pid); extern int preserve_ns(const int pid, const char *ns); +extern void do_release_file_info(struct fuse_file_info *fi); #endif /* __LXCFS_UTILS_H */