X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=bindings.c;h=9657160fe485cf70a2248b3f9b8345dc818f76cc;hb=3137a0a63db273005d62973a5d055a7623e98631;hp=150d0996dde28459bbeeb36749aec71057f4d01b;hpb=ba59ea0956a7a22f9b360e292cf3b082963b91f3;p=mirror_lxcfs.git diff --git a/bindings.c b/bindings.c index 150d099..9657160 100644 --- a/bindings.c +++ b/bindings.c @@ -8,30 +8,55 @@ #define FUSE_USE_VERSION 26 -#include +#define __STDC_FORMAT_MACROS #include +#include #include #include -#include -#include -#include -#include -#include -#include +#include #include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include #include #include -#include -#include -#include +#include +#include +#include #include "bindings.h" - #include "config.h" // for VERSION +/* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */ +#define LXCFS_NUMSTRLEN64 21 + +/* Define pivot_root() if missing from the C library */ +#ifndef HAVE_PIVOT_ROOT +static int pivot_root(const char * new_root, const char * put_old) +{ +#ifdef __NR_pivot_root +return syscall(__NR_pivot_root, new_root, put_old); +#else +errno = ENOSYS; +return -1; +#endif +} +#else +extern int pivot_root(const char * new_root, const char * put_old); +#endif + enum { LXC_TYPE_CGDIR, LXC_TYPE_CGFILE, @@ -54,8 +79,8 @@ struct file_info { int cached; }; -/* reserve buffer size, for cpuall in /proc/stat */ -#define BUF_RESERVE_SIZE 256 +/* Reserve buffer size to account for file size changes. */ +#define BUF_RESERVE_SIZE 512 /* * A table caching which pid is init for a pid namespace. @@ -90,17 +115,38 @@ static void lock_mutex(pthread_mutex_t *l) int ret; if ((ret = pthread_mutex_lock(l)) != 0) { - fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", ret, strerror(ret)); + lxcfs_error("returned:%d %s\n", ret, strerror(ret)); exit(1); } } +/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run. + * Number of hierarchies mounted. */ +static int num_hierarchies; + +/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run. + * Hierachies mounted {cpuset, blkio, ...}: + * Initialized via __constructor__ collect_and_mount_subsystems(). */ +static char **hierarchies; + +/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run. + * Open file descriptors: + * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a + * private mount namespace. + * Initialized via __constructor__ collect_and_mount_subsystems(). + * @fd_hierarchies[i] can be used to perform file operations on the cgroup + * mounts and respective files in the private namespace even when located in + * another namespace using the *at() family of functions + * {openat(), fchownat(), ...}. */ +static int *fd_hierarchies; +static int cgroup_mount_ns_fd = -1; + static void unlock_mutex(pthread_mutex_t *l) { int ret; if ((ret = pthread_mutex_unlock(l)) != 0) { - fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", ret, strerror(ret)); + lxcfs_error("returned:%d %s\n", ret, strerror(ret)); exit(1); } } @@ -124,10 +170,10 @@ static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb) snprintf(fnam, 100, "/proc/%d", e->initpid); if (stat(fnam, &initsb) < 0) return false; -#if DEBUG - fprintf(stderr, "comparing ctime %ld %ld for pid %d\n", - e->ctime, initsb.st_ctime, e->initpid); -#endif + + lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime, + initsb.st_ctime, e->initpid); + if (e->ctime != initsb.st_ctime) return false; return true; @@ -139,9 +185,8 @@ static void remove_initpid(struct pidns_init_store *e) struct pidns_init_store *tmp; int h; -#if DEBUG - fprintf(stderr, "remove_initpid: removing entry for %d\n", e->initpid); -#endif + lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid); + h = HASH(e->ino); if (pidns_hash_table[h] == e) { pidns_hash_table[h] = e->next; @@ -176,18 +221,18 @@ static void prune_initpid_store(void) now = time(NULL); if (now < last_prune + PURGE_SECS) return; -#if DEBUG - fprintf(stderr, "pruning\n"); -#endif + + lxcfs_debug("%s\n", "Pruning."); + last_prune = now; threshold = now - 2 * PURGE_SECS; for (i = 0; i < PIDNS_HASH_SIZE; i++) { for (prev = NULL, e = pidns_hash_table[i]; e; ) { if (e->lastcheck < threshold) { -#if DEBUG - fprintf(stderr, "Removing cached entry for %d\n", e->initpid); -#endif + + lxcfs_debug("Removing cached entry for %d.\n", e->initpid); + delme = e; if (prev) prev->next = e->next; @@ -211,9 +256,8 @@ static void save_initpid(struct stat *sb, pid_t pid) struct stat procsb; int h; -#if DEBUG - fprintf(stderr, "save_initpid: adding entry for %d\n", pid); -#endif + lxcfs_debug("Save_initpid: adding entry for %d.\n", pid); + snprintf(fpath, 100, "/proc/%d", pid); if (stat(fpath, &procsb) < 0) return; @@ -256,10 +300,10 @@ static struct pidns_init_store *lookup_verify_initpid(struct stat *sb) return NULL; } -static int is_dir(const char *path) +static int is_dir(const char *path, int fd) { struct stat statbuf; - int ret = stat(path, &statbuf); + int ret = fstatat(fd, path, &statbuf, fd); if (ret == 0 && S_ISDIR(statbuf.st_mode)) return 1; return 0; @@ -339,12 +383,12 @@ static bool write_string(const char *fnam, const char *string, int fd) len = strlen(string); ret = fwrite(string, 1, len, f); if (ret != len) { - fprintf(stderr, "Error writing to file: %s\n", strerror(errno)); + lxcfs_error("Error writing to file: %s\n", strerror(errno)); fclose(f); return false; } if (fclose(f) < 0) { - fprintf(stderr, "Error writing to file: %s\n", strerror(errno)); + lxcfs_error("Error writing to file: %s\n", strerror(errno)); return false; } return true; @@ -364,7 +408,7 @@ static bool store_hierarchy(char *stridx, char *h) n *= ALLOC_NUM; char **tmp = realloc(hierarchies, n * sizeof(char *)); if (!tmp) { - fprintf(stderr, "Out of memory\n"); + lxcfs_error("%s\n", strerror(errno)); exit(1); } hierarchies = tmp; @@ -378,10 +422,12 @@ static void print_subsystems(void) { int i; + fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd); fprintf(stderr, "hierarchies:\n"); for (i = 0; i < num_hierarchies; i++) { if (hierarchies[i]) - fprintf(stderr, " %d: %s\n", i, hierarchies[i]); + fprintf(stderr, " %2d: fd: %3d: %s\n", i, + fd_hierarchies[i], hierarchies[i]); } } @@ -390,7 +436,7 @@ static bool in_comma_list(const char *needle, const char *haystack) const char *s = haystack, *e; size_t nlen = strlen(needle); - while (*s && (e = index(s, ','))) { + while (*s && (e = strchr(s, ','))) { if (nlen != e - s) { s = e + 1; continue; @@ -434,11 +480,15 @@ bool cgfs_set_value(const char *controller, const char *cgroup, const char *file { int ret, fd, cfd; size_t len; - char *fnam, *tmpc = find_mounted_controller(controller, &cfd); + char *fnam, *tmpc; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return false; - /* . + /cgroup + / + file + \0 */ + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / + file + \0 + */ len = strlen(cgroup) + strlen(file) + 3; fnam = alloca(len); ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file); @@ -454,36 +504,40 @@ bool cgfs_set_value(const char *controller, const char *cgroup, const char *file // Chown all the files in the cgroup directory. We do this when we create // a cgroup on behalf of a user. -static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid) +static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd) { - struct dirent dirent, *direntp; + struct dirent *direntp; char path[MAXPATHLEN]; size_t len; DIR *d; - int ret; + int fd1, ret; len = strlen(dirname); if (len >= MAXPATHLEN) { - fprintf(stderr, "chown_all_cgroup_files: pathname too long: %s\n", dirname); + lxcfs_error("Pathname too long: %s\n", dirname); return; } - d = opendir(dirname); + fd1 = openat(fd, dirname, O_DIRECTORY); + if (fd1 < 0) + return; + + d = fdopendir(fd1); if (!d) { - fprintf(stderr, "chown_all_cgroup_files: failed to open %s\n", dirname); + lxcfs_error("Failed to open %s\n", dirname); return; } - while (readdir_r(d, &dirent, &direntp) == 0 && direntp) { + while ((direntp = readdir(d))) { if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, "..")) continue; ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); if (ret < 0 || ret >= MAXPATHLEN) { - fprintf(stderr, "chown_all_cgroup_files: pathname too long under %s\n", dirname); + lxcfs_error("Pathname too long under %s\n", dirname); continue; } - if (chown(path, uid, gid) < 0) - fprintf(stderr, "Failed to chown file %s to %u:%u", path, uid, gid); + if (fchownat(fd, path, uid, gid, 0) < 0) + lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid); } closedir(d); } @@ -492,126 +546,141 @@ int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid) { int cfd; size_t len; - char *dirnam, *tmpc = find_mounted_controller(controller, &cfd); + char *dirnam, *tmpc; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return -EINVAL; - /* BASEDIR / tmpc / cg \0 */ - len = strlen(BASEDIR) + strlen(tmpc) + strlen(cg) + 3; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cg + \0 + */ + len = strlen(cg) + 2; dirnam = alloca(len); - snprintf(dirnam, len, "%s/%s/%s", BASEDIR,tmpc, cg); + snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg); - if (mkdir(dirnam, 0755) < 0) + if (mkdirat(cfd, dirnam, 0755) < 0) return -errno; if (uid == 0 && gid == 0) return 0; - if (chown(dirnam, uid, gid) < 0) + if (fchownat(cfd, dirnam, uid, gid, 0) < 0) return -errno; - chown_all_cgroup_files(dirnam, uid, gid); + chown_all_cgroup_files(dirnam, uid, gid, cfd); return 0; } -static bool recursive_rmdir(const char *dirname) +static bool recursive_rmdir(const char *dirname, int fd, const int cfd) { - struct dirent dirent, *direntp; + struct dirent *direntp; DIR *dir; bool ret = false; char pathname[MAXPATHLEN]; + int dupfd; - dir = opendir(dirname); + dupfd = dup(fd); // fdopendir() does bad things once it uses an fd. + if (dupfd < 0) + return false; + + dir = fdopendir(dupfd); if (!dir) { -#if DEBUG - fprintf(stderr, "%s: failed to open %s: %s\n", __func__, dirname, strerror(errno)); -#endif + lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno)); + close(dupfd); return false; } - while (!readdir_r(dir, &dirent, &direntp)) { + while ((direntp = readdir(dir))) { struct stat mystat; int rc; - if (!direntp) - break; - if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, "..")) continue; rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); if (rc < 0 || rc >= MAXPATHLEN) { - fprintf(stderr, "pathname too long\n"); + lxcfs_error("%s\n", "Pathname too long."); continue; } - ret = lstat(pathname, &mystat); - if (ret) { -#if DEBUG - fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno)); -#endif + rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW); + if (rc) { + lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno)); continue; } - if (S_ISDIR(mystat.st_mode)) { - if (!recursive_rmdir(pathname)) { -#if DEBUG - fprintf(stderr, "Error removing %s\n", pathname); -#endif - } - } + if (S_ISDIR(mystat.st_mode)) + if (!recursive_rmdir(pathname, fd, cfd)) + lxcfs_debug("Error removing %s.\n", pathname); } ret = true; if (closedir(dir) < 0) { - fprintf(stderr, "%s: failed to close directory %s: %s\n", __func__, dirname, strerror(errno)); + lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno)); ret = false; } - if (rmdir(dirname) < 0) { -#if DEBUG - fprintf(stderr, "%s: failed to delete %s: %s\n", __func__, dirname, strerror(errno)); -#endif + if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) { + lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno)); ret = false; } + close(dupfd); + return ret; } bool cgfs_remove(const char *controller, const char *cg) { - int cfd; + int fd, cfd; size_t len; - char *dirnam, *tmpc = find_mounted_controller(controller, &cfd); + char *dirnam, *tmpc; + bool bret; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return false; - /* BASEDIR / tmpc / cg \0 */ - len = strlen(BASEDIR) + strlen(tmpc) + strlen(cg) + 3; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cg + \0 + */ + len = strlen(cg) + 2; dirnam = alloca(len); - snprintf(dirnam, len, "%s/%s/%s", BASEDIR,tmpc, cg); - return recursive_rmdir(dirnam); + snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg); + + fd = openat(cfd, dirnam, O_DIRECTORY); + if (fd < 0) + return false; + + bret = recursive_rmdir(dirnam, fd, cfd); + close(fd); + return bret; } bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode) { int cfd; size_t len; - char *pathname, *tmpc = find_mounted_controller(controller, &cfd); + char *pathname, *tmpc; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return false; - /* BASEDIR / tmpc / file \0 */ - len = strlen(BASEDIR) + strlen(tmpc) + strlen(file) + 3; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /file + \0 + */ + len = strlen(file) + 2; pathname = alloca(len); - snprintf(pathname, len, "%s/%s/%s", BASEDIR, tmpc, file); - if (chmod(pathname, mode) < 0) + snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file); + if (fchmodat(cfd, pathname, mode, 0) < 0) return false; return true; } -static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid) +static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd) { size_t len; char *fname; @@ -619,10 +688,10 @@ static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid) len = strlen(dirname) + strlen("/cgroup.procs") + 1; fname = alloca(len); snprintf(fname, len, "%s/tasks", dirname); - if (chown(fname, uid, gid) != 0) + if (fchownat(fd, fname, uid, gid, 0) != 0) return -errno; snprintf(fname, len, "%s/cgroup.procs", dirname); - if (chown(fname, uid, gid) != 0) + if (fchownat(fd, fname, uid, gid, 0) != 0) return -errno; return 0; } @@ -631,37 +700,50 @@ int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t g { int cfd; size_t len; - char *pathname, *tmpc = find_mounted_controller(controller, &cfd); + char *pathname, *tmpc; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return -EINVAL; - /* BASEDIR / tmpc / file \0 */ - len = strlen(BASEDIR) + strlen(tmpc) + strlen(file) + 3; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /file + \0 + */ + len = strlen(file) + 2; pathname = alloca(len); - snprintf(pathname, len, "%s/%s/%s", BASEDIR, tmpc, file); - if (chown(pathname, uid, gid) < 0) + snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file); + if (fchownat(cfd, pathname, uid, gid, 0) < 0) return -errno; - if (is_dir(pathname)) + if (is_dir(pathname, cfd)) // like cgmanager did, we want to chown the tasks file as well - return chown_tasks_files(pathname, uid, gid); + return chown_tasks_files(pathname, uid, gid, cfd); return 0; } FILE *open_pids_file(const char *controller, const char *cgroup) { - int cfd; + int fd, cfd; size_t len; - char *pathname, *tmpc = find_mounted_controller(controller, &cfd); + char *pathname, *tmpc; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return NULL; - /* BASEDIR / tmpc / cgroup / "cgroup.procs" \0 */ - len = strlen(BASEDIR) + strlen(tmpc) + strlen(cgroup) + 4 + strlen("cgroup.procs"); + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / "cgroup.procs" + \0 + */ + len = strlen(cgroup) + strlen("cgroup.procs") + 3; pathname = alloca(len); - snprintf(pathname, len, "%s/%s/%s/cgroup.procs", BASEDIR, tmpc, cgroup); - return fopen(pathname, "w"); + snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup); + + fd = openat(cfd, pathname, O_WRONLY); + if (fd < 0) + return NULL; + + return fdopen(fd, "w"); } static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories, @@ -681,12 +763,12 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool if (!tmpc) return false; - /* Make sure we pass a relative path to openat(). */ + /* Make sure we pass a relative path to *at() family of functions. */ len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */; cg = alloca(len); ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup); if (ret < 0 || (size_t)ret >= len) { - fprintf(stderr, "%s: pathname too long under %s\n", __func__, cgroup); + lxcfs_error("Pathname too long under %s\n", cgroup); return false; } @@ -707,13 +789,13 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name); if (ret < 0 || ret >= MAXPATHLEN) { - fprintf(stderr, "%s: pathname too long under %s\n", __func__, cg); + lxcfs_error("Pathname too long under %s\n", cg); continue; } ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW); if (ret) { - fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno)); + lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno)); continue; } if ((!directories && !S_ISREG(mystat.st_mode)) || @@ -733,7 +815,7 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool sz++; } if (closedir(dir) < 0) { - fprintf(stderr, "%s: failed closedir for %s: %s\n", __func__, cgroup, strerror(errno)); + lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno)); return false; } return true; @@ -777,20 +859,24 @@ bool cgfs_get_value(const char *controller, const char *cgroup, const char *file { int ret, fd, cfd; size_t len; - char *fnam, *tmpc = find_mounted_controller(controller, &cfd); + char *fnam, *tmpc; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return false; - /* . + /cgroup + / + file + \0 */ + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / + file + \0 + */ len = strlen(cgroup) + strlen(file) + 3; fnam = alloca(len); ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file); if (ret < 0 || (size_t)ret >= len) - return NULL; + return false; fd = openat(cfd, fnam, O_RDONLY); if (fd < 0) - return NULL; + return false; *value = slurp_file(fnam, fd); return *value != NULL; @@ -800,20 +886,23 @@ struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, cons { int ret, cfd; size_t len; - char *fnam, *tmpc = find_mounted_controller(controller, &cfd); + char *fnam, *tmpc; struct stat sb; struct cgfs_files *newkey; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return false; if (file && *file == '/') file++; - if (file && index(file, '/')) + if (file && strchr(file, '/')) return NULL; - /* . + /cgroup + / + file + \0 */ + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / + file + \0 + */ len = strlen(cgroup) + 3; if (file) len += strlen(file) + 1; @@ -830,8 +919,8 @@ struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, cons } while (!newkey); if (file) newkey->name = must_copy_string(file); - else if (rindex(cgroup, '/')) - newkey->name = must_copy_string(rindex(cgroup, '/')); + else if (strrchr(cgroup, '/')) + newkey->name = must_copy_string(strrchr(cgroup, '/')); else newkey->name = must_copy_string(cgroup); newkey->uid = sb.st_uid; @@ -845,8 +934,8 @@ static void *make_key_list_entry(const char *controller, const char *cgroup, con { struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry); if (!entry) { - fprintf(stderr, "%s: Error getting files under %s:%s\n", - __func__, controller, cgroup); + lxcfs_error("Error getting files under %s:%s\n", controller, + cgroup); } return entry; } @@ -860,13 +949,17 @@ bool is_child_cgroup(const char *controller, const char *cgroup, const char *f) { int cfd; size_t len; - char *fnam, *tmpc = find_mounted_controller(controller, &cfd); + char *fnam, *tmpc; int ret; struct stat sb; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return false; - /* . + /cgroup + / + f + \0 */ + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / + f + \0 + */ len = strlen(cgroup) + strlen(f) + 3; fnam = alloca(len); ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f); @@ -876,6 +969,7 @@ bool is_child_cgroup(const char *controller, const char *cgroup, const char *f) ret = fstatat(cfd, fnam, &sb, 0); if (ret < 0 || !S_ISDIR(sb.st_mode)) return false; + return true; } @@ -1078,7 +1172,7 @@ convert_id_to_ns(FILE *idfile, unsigned int in_id) * uids wrapped around - unexpected as this is a procfile, * so just bail. */ - fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n", + lxcfs_error("pid wrapparound at entry %u %u %u in %s\n", nsuid, hostuid, count, line); return -1; } @@ -1184,11 +1278,11 @@ static char *get_next_cgroup_dir(const char *taskcg, const char *querycg) char *start, *end; if (strlen(taskcg) <= strlen(querycg)) { - fprintf(stderr, "%s: I was fed bad input\n", __func__); + lxcfs_error("%s\n", "I was fed bad input."); return NULL; } - if (strcmp(querycg, "/") == 0) + if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0)) start = strdup(taskcg + 1); else start = strdup(taskcg + strlen(querycg) + 1); @@ -1414,23 +1508,30 @@ static char *pick_controller_from_path(struct fuse_context *fc, const char *path const char *p1; char *contr, *slash; - if (strlen(path) < 9) + if (strlen(path) < 9) { + errno = EACCES; return NULL; - if (*(path+7) != '/') + } + if (*(path + 7) != '/') { + errno = EINVAL; return NULL; - p1 = path+8; + } + p1 = path + 8; contr = strdupa(p1); - if (!contr) + if (!contr) { + errno = ENOMEM; return NULL; + } slash = strstr(contr, "/"); if (slash) *slash = '\0'; int i; - for (i = 0; i < num_hierarchies; i++) { + for (i = 0; i < num_hierarchies; i++) { if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0) return hierarchies[i]; } + errno = ENOENT; return NULL; } @@ -1442,12 +1543,17 @@ static const char *find_cgroup_in_path(const char *path) { const char *p1; - if (strlen(path) < 9) + if (strlen(path) < 9) { + errno = EACCES; return NULL; - p1 = strstr(path+8, "/"); - if (!p1) + } + p1 = strstr(path + 8, "/"); + if (!p1) { + errno = EINVAL; return NULL; - return p1+1; + } + errno = 0; + return p1 + 1; } /* @@ -1506,7 +1612,7 @@ int cg_getattr(const char *path, struct stat *sb) controller = pick_controller_from_path(fc, path); if (!controller) - return -EIO; + return -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) { /* this is just /cgroup/controller, return it as a dir */ @@ -1576,11 +1682,6 @@ int cg_getattr(const char *path, struct stat *sb) ret = -ENOENT; goto out; } - if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) { - ret = -EACCES; - goto out; - } - ret = 0; } @@ -1606,7 +1707,7 @@ int cg_opendir(const char *path, struct fuse_file_info *fi) // return list of keys for the controller, and list of child cgroups controller = pick_controller_from_path(fc, path); if (!controller) - return -EIO; + return -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) { @@ -1650,8 +1751,11 @@ int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset struct fuse_context *fc = fuse_get_context(); char **clist = NULL; + if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0) + return -EIO; + if (d->type != LXC_TYPE_CGDIR) { - fprintf(stderr, "Internal error: file cache info used in readdir\n"); + lxcfs_error("%s\n", "Internal error: file cache info used in readdir."); return -EIO; } if (!d->cgroup && !d->controller) { @@ -1761,10 +1865,10 @@ int cg_open(const char *path, struct fuse_file_info *fi) controller = pick_controller_from_path(fc, path); if (!controller) - return -EIO; + return -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) - return -EINVAL; + return -errno; get_cgdir_and_path(cgroup, &cgdir, &last); if (!last) { @@ -1817,18 +1921,22 @@ out: int cg_access(const char *path, int mode) { + int ret; const char *cgroup; - char *last = NULL, *path1, *path2, * cgdir = NULL, *controller; + char *path1, *path2, *controller; + char *last = NULL, *cgdir = NULL; struct cgfs_files *k = NULL; struct fuse_context *fc = fuse_get_context(); - int ret; + + if (strcmp(path, "/cgroup") == 0) + return 0; if (!fc) return -EIO; controller = pick_controller_from_path(fc, path); if (!controller) - return -EIO; + return -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) { // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not @@ -1892,14 +2000,14 @@ static bool wait_for_sock(int sock, int timeout) return false; if ((epfd = epoll_create(1)) < 0) { - fprintf(stderr, "Failed to create epoll socket: %m\n"); + lxcfs_error("%s\n", "Failed to create epoll socket: %m."); return false; } ev.events = POLLIN_SET; ev.data.fd = sock; if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) { - fprintf(stderr, "Failed adding socket to epoll: %m\n"); + lxcfs_error("%s\n", "Failed adding socket to epoll: %m."); close(epfd); return false; } @@ -1947,8 +2055,7 @@ static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst) if (pingfirst) { if (msgrecv(sock, buf, 1) != 1) { - fprintf(stderr, "%s: Error getting reply from server over socketpair\n", - __func__); + lxcfs_error("%s\n", "Error getting reply from server over socketpair."); return SEND_CREDS_FAIL; } } @@ -1972,8 +2079,7 @@ static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst) msg.msg_iovlen = 1; if (sendmsg(sock, &msg, 0) < 0) { - fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__, - strerror(errno)); + lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno)); if (errno == 3) return SEND_CREDS_NOTSK; return SEND_CREDS_FAIL; @@ -1999,12 +2105,12 @@ static bool recv_creds(int sock, struct ucred *cred, char *v) cred->gid = -1; if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) { - fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno)); + lxcfs_error("Failed to set passcred: %s\n", strerror(errno)); return false; } buf[0] = '1'; if (write(sock, buf, 1) != 1) { - fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno)); + lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno)); return false; } @@ -2019,14 +2125,12 @@ static bool recv_creds(int sock, struct ucred *cred, char *v) msg.msg_iovlen = 1; if (!wait_for_sock(sock, 2)) { - fprintf(stderr, "Timed out waiting for scm_cred: %s\n", - strerror(errno)); + lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno)); return false; } ret = recvmsg(sock, &msg, MSG_DONTWAIT); if (ret < 0) { - fprintf(stderr, "Failed to receive scm_cred: %s\n", - strerror(errno)); + lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno)); return false; } @@ -2059,10 +2163,8 @@ static int pid_ns_clone_wrapper(void *arg) { char b = '1'; close(args->cpipe[0]); - if (write(args->cpipe[1], &b, sizeof(char)) < 0) { - fprintf(stderr, "%s (child): error on write: %s\n", - __func__, strerror(errno)); - } + if (write(args->cpipe[1], &b, sizeof(char)) < 0) + lxcfs_error("(child): error on write: %s.\n", strerror(errno)); close(args->cpipe[1]); return args->wrapped(args->sock, args->tpid); } @@ -2196,13 +2298,11 @@ bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *fi // read converted results if (!wait_for_sock(sock[0], 2)) { - fprintf(stderr, "%s: timed out waiting for pid from child: %s\n", - __func__, strerror(errno)); + lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno)); goto out; } if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { - fprintf(stderr, "%s: error reading pid from child: %s\n", - __func__, strerror(errno)); + lxcfs_error("Error reading pid from child: %s.\n", strerror(errno)); goto out; } must_strcat_pid(d, &sz, &asz, qpid); @@ -2217,8 +2317,7 @@ next: v = '1'; if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) { // failed to ask child to exit - fprintf(stderr, "%s: failed to ask child to exit: %s\n", - __func__, strerror(errno)); + lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno)); goto out; } @@ -2246,7 +2345,7 @@ int cg_read(const char *path, char *buf, size_t size, off_t offset, bool r; if (f->type != LXC_TYPE_CGFILE) { - fprintf(stderr, "Internal error: directory cache info used in cg_read\n"); + lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read."); return -EIO; } @@ -2313,12 +2412,11 @@ static int pid_from_ns(int sock, pid_t tpid) cred.gid = 0; while (1) { if (!wait_for_sock(sock, 2)) { - fprintf(stderr, "%s: timeout reading from parent\n", __func__); + lxcfs_error("%s\n", "Timeout reading from parent."); return 1; } if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) { - fprintf(stderr, "%s: bad read from parent: %s\n", - __func__, strerror(errno)); + lxcfs_error("Bad read from parent: %s.\n", strerror(errno)); return 1; } if (vpid == -1) // done @@ -2419,20 +2517,20 @@ void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid) *gid = -1; sprintf(line, "/proc/%d/status", pid); if ((f = fopen(line, "r")) == NULL) { - fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno)); + lxcfs_error("Error opening %s: %s\n", line, strerror(errno)); return; } while (fgets(line, 400, f)) { if (strncmp(line, "Uid:", 4) == 0) { if (sscanf(line+4, "%u", &u) != 1) { - fprintf(stderr, "bad uid line for pid %u\n", pid); + lxcfs_error("bad uid line for pid %u\n", pid); fclose(f); return; } *uid = u; } else if (strncmp(line, "Gid:", 4) == 0) { if (sscanf(line+4, "%u", &g) != 1) { - fprintf(stderr, "bad gid line for pid %u\n", pid); + lxcfs_error("bad gid line for pid %u\n", pid); fclose(f); return; } @@ -2504,8 +2602,7 @@ static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char char v; if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { - fprintf(stderr, "%s: error writing pid to child: %s\n", - __func__, strerror(errno)); + lxcfs_error("Error writing pid to child: %s.\n", strerror(errno)); goto out; } @@ -2529,7 +2626,7 @@ static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char /* All good, write the value */ qpid = -1; if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid)) - fprintf(stderr, "Warning: failed to ask child to exit\n"); + lxcfs_error("%s\n", "Warning: failed to ask child to exit."); if (!fail) answer = true; @@ -2558,7 +2655,7 @@ int cg_write(const char *path, const char *buf, size_t size, off_t offset, bool r; if (f->type != LXC_TYPE_CGFILE) { - fprintf(stderr, "Internal error: directory cache info used in cg_write\n"); + lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write."); return -EIO; } @@ -2611,15 +2708,16 @@ int cg_chown(const char *path, uid_t uid, gid_t gid) return -EIO; if (strcmp(path, "/cgroup") == 0) - return -EINVAL; + return -EPERM; controller = pick_controller_from_path(fc, path); if (!controller) - return -EINVAL; + return errno == ENOENT ? -EPERM : -errno; + cgroup = find_cgroup_in_path(path); if (!cgroup) /* this is just /cgroup/controller */ - return -EINVAL; + return -EPERM; get_cgdir_and_path(cgroup, &cgdir, &last); @@ -2676,15 +2774,16 @@ int cg_chmod(const char *path, mode_t mode) return -EIO; if (strcmp(path, "/cgroup") == 0) - return -EINVAL; + return -EPERM; controller = pick_controller_from_path(fc, path); if (!controller) - return -EINVAL; + return errno == ENOENT ? -EPERM : -errno; + cgroup = find_cgroup_in_path(path); if (!cgroup) /* this is just /cgroup/controller */ - return -EINVAL; + return -EPERM; get_cgdir_and_path(cgroup, &cgdir, &last); @@ -2742,14 +2841,13 @@ int cg_mkdir(const char *path, mode_t mode) if (!fc) return -EIO; - controller = pick_controller_from_path(fc, path); if (!controller) - return -EINVAL; + return errno == ENOENT ? -EPERM : -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) - return -EINVAL; + return -errno; get_cgdir_and_path(cgroup, &cgdir, &last); if (!last) @@ -2766,7 +2864,7 @@ int cg_mkdir(const char *path, mode_t mode) else if (last && strcmp(next, last) == 0) ret = -EEXIST; else - ret = -ENOENT; + ret = -EPERM; goto out; } @@ -2798,16 +2896,20 @@ int cg_rmdir(const char *path) return -EIO; controller = pick_controller_from_path(fc, path); - if (!controller) - return -EINVAL; + if (!controller) /* Someone's trying to delete "/cgroup". */ + return -EPERM; cgroup = find_cgroup_in_path(path); - if (!cgroup) - return -EINVAL; + if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */ + return -EPERM; get_cgdir_and_path(cgroup, &cgdir, &last); if (!last) { - ret = -EINVAL; + /* Someone's trying to delete a cgroup on the same level as the + * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or + * rmdir "/cgroup/blkio/init.slice". + */ + ret = -EPERM; goto out; } @@ -2815,7 +2917,7 @@ int cg_rmdir(const char *path) if (initpid <= 0) initpid = fc->pid; if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) { - if (!last || strcmp(next, last) == 0) + if (!last || (next && (strcmp(next, last) == 0))) ret = -EBUSY; else ret = -ENOENT; @@ -2851,16 +2953,32 @@ static bool startswith(const char *line, const char *pref) return false; } -static void get_mem_cached(char *memstat, unsigned long *v) +static void parse_memstat(char *memstat, unsigned long *cached, + unsigned long *active_anon, unsigned long *inactive_anon, + unsigned long *active_file, unsigned long *inactive_file, + unsigned long *unevictable) { char *eol; - *v = 0; while (*memstat) { if (startswith(memstat, "total_cache")) { - sscanf(memstat + 11, "%lu", v); - *v /= 1024; - return; + sscanf(memstat + 11, "%lu", cached); + *cached /= 1024; + } else if (startswith(memstat, "total_active_anon")) { + sscanf(memstat + 17, "%lu", active_anon); + *active_anon /= 1024; + } else if (startswith(memstat, "total_inactive_anon")) { + sscanf(memstat + 19, "%lu", inactive_anon); + *inactive_anon /= 1024; + } else if (startswith(memstat, "total_active_file")) { + sscanf(memstat + 17, "%lu", active_file); + *active_file /= 1024; + } else if (startswith(memstat, "total_inactive_file")) { + sscanf(memstat + 19, "%lu", inactive_file); + *inactive_file /= 1024; + } else if (startswith(memstat, "total_unevictable")) { + sscanf(memstat + 17, "%lu", unevictable); + *unevictable /= 1024; } eol = strchr(memstat, '\n'); if (!eol) @@ -2911,7 +3029,7 @@ static int read_file(const char *path, char *buf, size_t size, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -2937,12 +3055,12 @@ static int read_file(const char *path, char *buf, size_t size, * FUSE ops for /proc */ -static unsigned long get_memlimit(const char *cgroup) +static unsigned long get_memlimit(const char *cgroup, const char *file) { char *memlimit_str = NULL; unsigned long memlimit = -1; - if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &memlimit_str)) + if (cgfs_get_value("memory", cgroup, file, &memlimit_str)) memlimit = strtoul(memlimit_str, NULL, 10); free(memlimit_str); @@ -2950,16 +3068,16 @@ static unsigned long get_memlimit(const char *cgroup) return memlimit; } -static unsigned long get_min_memlimit(const char *cgroup) +static unsigned long get_min_memlimit(const char *cgroup, const char *file) { char *copy = strdupa(cgroup); unsigned long memlimit = 0, retlimit; - retlimit = get_memlimit(copy); + retlimit = get_memlimit(copy, file); while (strcmp(copy, "/") != 0) { copy = dirname(copy); - memlimit = get_memlimit(copy); + memlimit = get_memlimit(copy, file); if (memlimit != -1 && memlimit < retlimit) retlimit = memlimit; }; @@ -2974,10 +3092,11 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, struct file_info *d = (struct file_info *)fi->fh; char *cg; char *memusage_str = NULL, *memstat_str = NULL, - *memswlimit_str = NULL, *memswusage_str = NULL, - *memswlimit_default_str = NULL, *memswusage_default_str = NULL; + *memswlimit_str = NULL, *memswusage_str = NULL; unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0, - cached = 0, hosttotal = 0; + cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0, + active_file = 0, inactive_file = 0, unevictable = 0, + hostswtotal = 0; char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; char *cache = d->buf; @@ -3003,7 +3122,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, return read_file("/proc/meminfo", buf, size, d); prune_init_slice(cg); - memlimit = get_min_memlimit(cg); + memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str)) goto err; if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str)) @@ -3014,20 +3133,9 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) && cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str)) { - /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */ - if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str)) - goto err; - if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str)) - goto err; - - memswlimit = strtoul(memswlimit_str, NULL, 10); + memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes"); memswusage = strtoul(memswusage_str, NULL, 10); - if (!strcmp(memswlimit_str, memswlimit_default_str)) - memswlimit = 0; - if (!strcmp(memswusage_str, memswusage_default_str)) - memswusage = 0; - memswlimit = memswlimit / 1024; memswusage = memswusage / 1024; } @@ -3036,7 +3144,9 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, memlimit /= 1024; memusage /= 1024; - get_mem_cached(memstat_str, &cached); + parse_memstat(memstat_str, &cached, &active_anon, + &inactive_anon, &active_file, &inactive_file, + &unevictable); f = fopen("/proc/meminfo", "r"); if (!f) @@ -3048,7 +3158,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, memset(lbuf, 0, 100); if (startswith(line, "MemTotal:")) { - sscanf(line+14, "%lu", &hosttotal); + sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal); if (hosttotal < memlimit) memlimit = hosttotal; snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit); @@ -3057,14 +3167,19 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage); printme = lbuf; } else if (startswith(line, "MemAvailable:")) { - snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage); + snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached); printme = lbuf; } else if (startswith(line, "SwapTotal:") && memswlimit > 0) { - snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit); + sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal); + if (hostswtotal < memswlimit) + memswlimit = hostswtotal; + snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit); printme = lbuf; } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) { - snprintf(lbuf, 100, "SwapFree: %8lu kB\n", - (memswlimit - memlimit) - (memswusage - memusage)); + unsigned long swaptotal = memswlimit, + swapusage = memswusage - memusage, + swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0; + snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree); printme = lbuf; } else if (startswith(line, "Slab:")) { snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL); @@ -3078,6 +3193,35 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, } else if (startswith(line, "SwapCached:")) { snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL); printme = lbuf; + } else if (startswith(line, "Active:")) { + snprintf(lbuf, 100, "Active: %8lu kB\n", + active_anon + active_file); + printme = lbuf; + } else if (startswith(line, "Inactive:")) { + snprintf(lbuf, 100, "Inactive: %8lu kB\n", + inactive_anon + inactive_file); + printme = lbuf; + } else if (startswith(line, "Active(anon)")) { + snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon); + printme = lbuf; + } else if (startswith(line, "Inactive(anon)")) { + snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon); + printme = lbuf; + } else if (startswith(line, "Active(file)")) { + snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file); + printme = lbuf; + } else if (startswith(line, "Inactive(file)")) { + snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file); + printme = lbuf; + } else if (startswith(line, "Unevictable")) { + snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable); + printme = lbuf; + } else if (startswith(line, "SReclaimable")) { + snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL); + printme = lbuf; + } else if (startswith(line, "SUnreclaim")) { + snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL); + printme = lbuf; } else printme = line; @@ -3089,7 +3233,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3114,8 +3258,6 @@ err: free(memswlimit_str); free(memswusage_str); free(memstat_str); - free(memswlimit_default_str); - free(memswusage_default_str); return rv; } @@ -3220,7 +3362,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3245,7 +3387,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3263,7 +3405,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3321,6 +3463,137 @@ err: return rv; } +static uint64_t get_reaper_start_time(pid_t pid) +{ + int ret; + FILE *f; + uint64_t starttime; + /* strlen("/proc/") = 6 + * + + * LXCFS_NUMSTRLEN64 + * + + * strlen("/stat") = 5 + * + + * \0 = 1 + * */ +#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1) + char path[__PROC_PID_STAT_LEN]; + pid_t qpid; + + qpid = lookup_initpid_in_store(pid); + if (qpid <= 0) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid); + if (ret < 0 || ret >= __PROC_PID_STAT_LEN) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + f = fopen(path, "r"); + if (!f) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + /* Note that the *scanf() argument supression requires that length + * modifiers such as "l" are omitted. Otherwise some compilers will yell + * at us. It's like telling someone you're not married and then asking + * if you can bring your wife to the party. + */ + ret = fscanf(f, "%*d " /* (1) pid %d */ + "%*s " /* (2) comm %s */ + "%*c " /* (3) state %c */ + "%*d " /* (4) ppid %d */ + "%*d " /* (5) pgrp %d */ + "%*d " /* (6) session %d */ + "%*d " /* (7) tty_nr %d */ + "%*d " /* (8) tpgid %d */ + "%*u " /* (9) flags %u */ + "%*u " /* (10) minflt %lu */ + "%*u " /* (11) cminflt %lu */ + "%*u " /* (12) majflt %lu */ + "%*u " /* (13) cmajflt %lu */ + "%*u " /* (14) utime %lu */ + "%*u " /* (15) stime %lu */ + "%*d " /* (16) cutime %ld */ + "%*d " /* (17) cstime %ld */ + "%*d " /* (18) priority %ld */ + "%*d " /* (19) nice %ld */ + "%*d " /* (20) num_threads %ld */ + "%*d " /* (21) itrealvalue %ld */ + "%" PRIu64, /* (22) starttime %llu */ + &starttime); + if (ret != 1) { + fclose(f); + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + fclose(f); + + errno = 0; + return starttime; +} + +static uint64_t get_reaper_start_time_in_sec(pid_t pid) +{ + uint64_t clockticks; + int64_t ticks_per_sec; + + clockticks = get_reaper_start_time(pid); + if (clockticks == 0 && errno == EINVAL) { + lxcfs_debug("failed to retrieve start time of pid %d\n", pid); + return 0; + } + + ticks_per_sec = sysconf(_SC_CLK_TCK); + if (ticks_per_sec < 0 && errno == EINVAL) { + lxcfs_debug( + "%s\n", + "failed to determine number of clock ticks in a second"); + return 0; + } + + return (clockticks /= ticks_per_sec); +} + +static uint64_t get_reaper_age(pid_t pid) +{ + uint64_t procstart, uptime, procage; + + /* We need to substract the time the process has started since system + * boot minus the time when the system has started to get the actual + * reaper age. + */ + procstart = get_reaper_start_time_in_sec(pid); + procage = procstart; + if (procstart > 0) { + int ret; + struct timespec spec; + + ret = clock_gettime(CLOCK_BOOTTIME, &spec); + if (ret < 0) + return 0; + /* We could make this more precise here by using the tv_nsec + * field in the timespec struct and convert it to milliseconds + * and then create a double for the seconds and milliseconds but + * that seems more work than it is worth. + */ + uptime = spec.tv_sec; + procage = uptime - procstart; + } + + return procage; +} + +#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2) static int proc_stat_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { @@ -3331,10 +3604,9 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; int curcpu = -1; /* cpu numbering starts at 0 */ - unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0; + unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0; unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0, - irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0; -#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE + irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0; char cpuall[CPUALL_MAX_SIZE]; /* reserve for cpu all */ char *cache = d->buf + CPUALL_MAX_SIZE; @@ -3370,7 +3642,7 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, //skip first line if (getline(&line, &linelen, f) < 0) { - fprintf(stderr, "proc_stat_read read first line failed\n"); + lxcfs_error("%s\n", "proc_stat_read read first line failed."); goto err; } @@ -3380,6 +3652,8 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, char cpu_char[10]; /* That's a lot of cores */ char *c; + if (strlen(line) == 0) + continue; if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) { /* not a ^cpuN line containing a number N, just print it */ l = snprintf(cache, cache_size, "%s", line); @@ -3389,7 +3663,7 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3416,7 +3690,7 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3425,8 +3699,17 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, cache_size -= l; total_len += l; - if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq, - &softirq, &steal, &guest) != 9) + if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + &user, + &nice, + &system, + &idle, + &iowait, + &irq, + &softirq, + &steal, + &guest, + &guest_nice) != 10) continue; user_sum += user; nice_sum += nice; @@ -3437,18 +3720,28 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, softirq_sum += softirq; steal_sum += steal; guest_sum += guest; + guest_nice_sum += guest_nice; } cache = d->buf; - int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum); - if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){ + int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + user_sum, + nice_sum, + system_sum, + idle_sum, + iowait_sum, + irq_sum, + softirq_sum, + steal_sum, + guest_sum, + guest_nice_sum); + if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) { memcpy(cache, cpuall, cpuall_len); cache += cpuall_len; - } else{ + } else { /* shouldn't happen */ - fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len); + lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len); cpuall_len = 0; } @@ -3456,7 +3749,8 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, total_len += cpuall_len; d->cached = 1; d->size = total_len; - if (total_len > size ) total_len = size; + if (total_len > size) + total_len = size; memcpy(buf, d->buf, total_len); rv = total_len; @@ -3470,27 +3764,13 @@ err: return rv; } -static long int getreaperage(pid_t pid) -{ - char fnam[100]; - struct stat sb; - int ret; - pid_t qpid; - - qpid = lookup_initpid_in_store(pid); - if (qpid <= 0) - return 0; - - ret = snprintf(fnam, 100, "/proc/%d", qpid); - if (ret < 0 || ret >= 100) - return 0; - - if (lstat(fnam, &sb) < 0) - return 0; - - return time(NULL) - sb.st_ctime; -} - +/* This function retrieves the busy time of a group of tasks by looking at + * cpuacct.usage. Unfortunately, this only makes sense when the container has + * been given it's own cpuacct cgroup. If not, this function will take the busy + * time of all other taks that do not actually belong to the container into + * account as well. If someone has a clever solution for this please send a + * patch! + */ static unsigned long get_reaper_busy(pid_t task) { pid_t initpid = lookup_initpid_in_store(task); @@ -3518,17 +3798,9 @@ out: #if RELOADTEST void iwashere(void) { - char *name, *cwd = get_current_dir_name(); - size_t len; int fd; - if (!cwd) - exit(1); - len = strlen(cwd) + strlen("/iwashere") + 1; - name = alloca(len); - snprintf(name, len, "%s/iwashere", cwd); - free(cwd); - fd = creat(name, 0755); + fd = creat("/tmp/lxcfs-iwashere", 0644); if (fd >= 0) close(fd); } @@ -3544,33 +3816,37 @@ static int proc_uptime_read(char *buf, size_t size, off_t offset, { struct fuse_context *fc = fuse_get_context(); struct file_info *d = (struct file_info *)fi->fh; - long int reaperage = getreaperage(fc->pid); - unsigned long int busytime = get_reaper_busy(fc->pid), idletime; + unsigned long int busytime = get_reaper_busy(fc->pid); char *cache = d->buf; ssize_t total_len = 0; + uint64_t idletime, reaperage; #if RELOADTEST iwashere(); #endif if (offset){ - if (offset > d->size) - return -EINVAL; if (!d->cached) return 0; + if (offset > d->size) + return -EINVAL; int left = d->size - offset; total_len = left > size ? size: left; memcpy(buf, cache + offset, total_len); return total_len; } - idletime = reaperage - busytime; - if (idletime > reaperage) - idletime = reaperage; + reaperage = get_reaper_age(fc->pid); + /* To understand why this is done, please read the comment to the + * get_reaper_busy() function. + */ + idletime = reaperage; + if (reaperage >= busytime) + idletime = reaperage - busytime; - total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime); - if (total_len < 0){ - perror("Error writing to cache"); + total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime); + if (total_len < 0 || total_len >= d->buflen){ + lxcfs_error("%s\n", "failed to write to cache"); return 0; } @@ -3688,7 +3964,7 @@ static int proc_diskstats_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3722,8 +3998,7 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, struct fuse_context *fc = fuse_get_context(); struct file_info *d = (struct file_info *)fi->fh; char *cg = NULL; - char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL, - *memswlimit_default_str = NULL, *memswusage_default_str = NULL; + char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL; unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0; ssize_t total_len = 0, rv = 0; ssize_t l = 0; @@ -3748,32 +4023,19 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, return read_file("/proc/swaps", buf, size, d); prune_init_slice(cg); - if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str)) - goto err; + memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str)) goto err; - memlimit = strtoul(memlimit_str, NULL, 10); memusage = strtoul(memusage_str, NULL, 10); if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) && cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) { - /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */ - if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str)) - goto err; - if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str)) - goto err; - - memswlimit = strtoul(memswlimit_str, NULL, 10); + memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes"); memswusage = strtoul(memswusage_str, NULL, 10); - if (!strcmp(memswlimit_str, memswlimit_default_str)) - memswlimit = 0; - if (!strcmp(memswusage_str, memswusage_default_str)) - memswusage = 0; - swap_total = (memswlimit - memlimit) / 1024; swap_free = (memswusage - memusage) / 1024; } @@ -3827,8 +4089,6 @@ err: free(memlimit_str); free(memusage_str); free(memswusage_str); - free(memswusage_default_str); - free(memswlimit_default_str); return rv; } @@ -3881,12 +4141,14 @@ int proc_getattr(const char *path, struct stat *sb) int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi) { - if (filler(buf, "cpuinfo", NULL, 0) != 0 || - filler(buf, "meminfo", NULL, 0) != 0 || - filler(buf, "stat", NULL, 0) != 0 || - filler(buf, "uptime", NULL, 0) != 0 || - filler(buf, "diskstats", NULL, 0) != 0 || - filler(buf, "swaps", NULL, 0) != 0) + if (filler(buf, ".", NULL, 0) != 0 || + filler(buf, "..", NULL, 0) != 0 || + filler(buf, "cpuinfo", NULL, 0) != 0 || + filler(buf, "meminfo", NULL, 0) != 0 || + filler(buf, "stat", NULL, 0) != 0 || + filler(buf, "uptime", NULL, 0) != 0 || + filler(buf, "diskstats", NULL, 0) != 0 || + filler(buf, "swaps", NULL, 0) != 0) return -EINVAL; return 0; } @@ -3932,6 +4194,9 @@ int proc_open(const char *path, struct fuse_file_info *fi) int proc_access(const char *path, int mask) { + if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0) + return 0; + /* these are all read-only */ if ((mask & ~R_OK) != 0) return -EACCES; @@ -3967,22 +4232,365 @@ int proc_read(const char *path, char *buf, size_t size, off_t offset, } } -static void __attribute__((constructor)) collect_subsystems(void) +/* + * Functions needed to setup cgroups in the __constructor__. + */ + +static bool mkdir_p(const char *dir, mode_t mode) +{ + const char *tmp = dir; + const char *orig = dir; + char *makeme; + + do { + dir = tmp + strspn(tmp, "/"); + tmp = dir + strcspn(dir, "/"); + makeme = strndup(orig, dir - orig); + if (!makeme) + return false; + if (mkdir(makeme, mode) && errno != EEXIST) { + lxcfs_error("Failed to create directory '%s': %s.\n", + makeme, strerror(errno)); + free(makeme); + return false; + } + free(makeme); + } while(tmp != dir); + + return true; +} + +static bool umount_if_mounted(void) +{ + if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { + lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); + return false; + } + return true; +} + +/* __typeof__ should be safe to use with all compilers. */ +typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic; +static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val) +{ + return (fs->f_type == (fs_type_magic)magic_val); +} + +/* + * looking at fs/proc_namespace.c, it appears we can + * actually expect the rootfs entry to very specifically contain + * " - rootfs rootfs " + * IIUC, so long as we've chrooted so that rootfs is not our root, + * the rootfs entry should always be skipped in mountinfo contents. + */ +static bool is_on_ramfs(void) { FILE *f; + char *p, *p2; char *line = NULL; size_t len = 0; + int i; + + f = fopen("/proc/self/mountinfo", "r"); + if (!f) + return false; + + while (getline(&line, &len, f) != -1) { + for (p = line, i = 0; p && i < 4; i++) + p = strchr(p + 1, ' '); + if (!p) + continue; + p2 = strchr(p + 1, ' '); + if (!p2) + continue; + *p2 = '\0'; + if (strcmp(p + 1, "/") == 0) { + // this is '/'. is it the ramfs? + p = strchr(p2 + 1, '-'); + if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) { + free(line); + fclose(f); + return true; + } + } + } + free(line); + fclose(f); + return false; +} + +static int pivot_enter() +{ + int ret = -1, oldroot = -1, newroot = -1; + + oldroot = open("/", O_DIRECTORY | O_RDONLY); + if (oldroot < 0) { + lxcfs_error("%s\n", "Failed to open old root for fchdir."); + return ret; + } + + newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY); + if (newroot < 0) { + lxcfs_error("%s\n", "Failed to open new root for fchdir."); + goto err; + } + + /* change into new root fs */ + if (fchdir(newroot) < 0) { + lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR); + goto err; + } + + /* pivot_root into our new root fs */ + if (pivot_root(".", ".") < 0) { + lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno)); + goto err; + } + + /* + * At this point the old-root is mounted on top of our new-root. + * To unmounted it we must not be chdir'd into it, so escape back + * to the old-root. + */ + if (fchdir(oldroot) < 0) { + lxcfs_error("%s\n", "Failed to enter old root."); + goto err; + } + + if (umount2(".", MNT_DETACH) < 0) { + lxcfs_error("%s\n", "Failed to detach old root."); + goto err; + } + + if (fchdir(newroot) < 0) { + lxcfs_error("%s\n", "Failed to re-enter new root."); + goto err; + } + + ret = 0; + +err: + if (oldroot > 0) + close(oldroot); + if (newroot > 0) + close(newroot); + + return ret; +} + +static int chroot_enter() +{ + if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { + lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); + return -1; + } + + if (chroot(".") < 0) { + lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); + return -1; + } + + if (chdir("/") < 0) { + lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); + return -1; + } + + return 0; +} + +static int permute_and_enter(void) +{ + struct statfs sb; + + if (statfs("/", &sb) < 0) { + lxcfs_error("%s\n", "Could not stat / mountpoint."); + return -1; + } + + /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will + * likely report TMPFS_MAGIC. Hence, when it reports no we still check + * /proc/1/mountinfo. */ + if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs()) + return chroot_enter(); + + if (pivot_enter() < 0) { + lxcfs_error("%s\n", "Could not perform pivot root."); + return -1; + } + + return 0; +} + +/* Prepare our new clean root. */ +static int permute_prepare(void) +{ + if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { + lxcfs_error("%s\n", "Failed to create directory for new root."); + return -1; + } + + if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { + lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); + return -1; + } + + if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { + lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); + return -1; + } + + if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { + printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); + return -1; + } + + return 0; +} + +/* Calls chroot() on ramfs, pivot_root() in all other cases. */ +static bool permute_root(void) +{ + /* Prepare new root. */ + if (permute_prepare() < 0) + return false; + + /* Pivot into new root. */ + if (permute_and_enter() < 0) + return false; + + return true; +} + +static int preserve_mnt_ns(int pid) +{ + int ret; + size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt"); + char path[len]; + + ret = snprintf(path, len, "/proc/%d/ns/mnt", pid); + if (ret < 0 || (size_t)ret >= len) + return -1; + + return open(path, O_RDONLY | O_CLOEXEC); +} + +static bool cgfs_prepare_mounts(void) +{ + if (!mkdir_p(BASEDIR, 0700)) { + lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); + return false; + } + + if (!umount_if_mounted()) { + lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); + return false; + } + + if (unshare(CLONE_NEWNS) < 0) { + lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); + return false; + } + + cgroup_mount_ns_fd = preserve_mnt_ns(getpid()); + if (cgroup_mount_ns_fd < 0) { + lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); + return false; + } + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { + lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); + return false; + } + + if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { + lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); + return false; + } + + return true; +} + +static bool cgfs_mount_hierarchies(void) +{ + char *target; + size_t clen, len; + int i, ret; + + for (i = 0; i < num_hierarchies; i++) { + char *controller = hierarchies[i]; + + clen = strlen(controller); + len = strlen(BASEDIR) + clen + 2; + target = malloc(len); + if (!target) + return false; + + ret = snprintf(target, len, "%s/%s", BASEDIR, controller); + if (ret < 0 || ret >= len) { + free(target); + return false; + } + if (mkdir(target, 0755) < 0 && errno != EEXIST) { + free(target); + return false; + } + if (!strcmp(controller, "unified")) + ret = mount("none", target, "cgroup2", 0, NULL); + else + ret = mount(controller, target, "cgroup", 0, controller); + if (ret < 0) { + lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno)); + free(target); + return false; + } + + fd_hierarchies[i] = open(target, O_DIRECTORY); + if (fd_hierarchies[i] < 0) { + free(target); + return false; + } + free(target); + } + return true; +} + +static bool cgfs_setup_controllers(void) +{ + if (!cgfs_prepare_mounts()) + return false; + + if (!cgfs_mount_hierarchies()) { + lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts."); + return false; + } + + if (!permute_root()) + return false; + + return true; +} + +static void __attribute__((constructor)) collect_and_mount_subsystems(void) +{ + FILE *f; + char *cret, *line = NULL; + char cwd[MAXPATHLEN]; + size_t len = 0; + int i, init_ns = -1; + bool found_unified = false; if ((f = fopen("/proc/self/cgroup", "r")) == NULL) { - fprintf(stderr, "Error opening /proc/self/cgroup: %s\n", strerror(errno)); + lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno)); return; } + while (getline(&line, &len, f) != -1) { - char *p, *p2; + char *idx, *p, *p2; p = strchr(line, ':'); if (!p) goto out; + idx = line; *(p++) = '\0'; p2 = strrchr(p, ':'); @@ -3995,26 +4603,74 @@ static void __attribute__((constructor)) collect_subsystems(void) * because it parses out the empty string "" and later on passes * it to mount(). Let's skip such entries. */ - if (!strcmp(p, "")) - continue; + if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) { + found_unified = true; + p = "unified"; + } if (!store_hierarchy(line, p)) goto out; } + /* Preserve initial namespace. */ + init_ns = preserve_mnt_ns(getpid()); + if (init_ns < 0) { + lxcfs_error("%s\n", "Failed to preserve initial mount namespace."); + goto out; + } + + fd_hierarchies = malloc(sizeof(int) * num_hierarchies); + if (!fd_hierarchies) { + lxcfs_error("%s\n", strerror(errno)); + goto out; + } + + for (i = 0; i < num_hierarchies; i++) + fd_hierarchies[i] = -1; + + cret = getcwd(cwd, MAXPATHLEN); + if (!cret) + lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno)); + + /* This function calls unshare(CLONE_NEWNS) our initial mount namespace + * to privately mount lxcfs cgroups. */ + if (!cgfs_setup_controllers()) { + lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs."); + goto out; + } + + if (setns(init_ns, 0) < 0) { + lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno)); + goto out; + } + + if (!cret || chdir(cwd) < 0) + lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno)); + print_subsystems(); out: free(line); fclose(f); + if (init_ns >= 0) + close(init_ns); } static void __attribute__((destructor)) free_subsystems(void) { int i; - for (i = 0; i < num_hierarchies; i++) + lxcfs_debug("%s\n", "Running destructor for liblxcfs."); + + for (i = 0; i < num_hierarchies; i++) { if (hierarchies[i]) free(hierarchies[i]); + if (fd_hierarchies && fd_hierarchies[i] >= 0) + close(fd_hierarchies[i]); + } free(hierarchies); + free(fd_hierarchies); + + if (cgroup_mount_ns_fd >= 0) + close(cgroup_mount_ns_fd); }