X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=bindings.c;h=5b5860e73a889579baaaa618613e345854708e14;hb=e48e16dc588e1a7718453f06b0f7228d13cd2c84;hp=a16722c2b3b857ba45cdf8a7a07bd48580606264;hpb=8cf8c4f8f2bb994ab16a236e1731ca33512c465f;p=mirror_lxcfs.git diff --git a/bindings.c b/bindings.c index a16722c..5b5860e 100644 --- a/bindings.c +++ b/bindings.c @@ -8,30 +8,48 @@ #define FUSE_USE_VERSION 26 -#include #include +#include #include #include -#include -#include -#include -#include -#include -#include #include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include #include #include -#include -#include -#include +#include +#include #include "bindings.h" - #include "config.h" // for VERSION +/* Define pivot_root() if missing from the C library */ +#ifndef HAVE_PIVOT_ROOT +static int pivot_root(const char * new_root, const char * put_old) +{ +#ifdef __NR_pivot_root +return syscall(__NR_pivot_root, new_root, put_old); +#else +errno = ENOSYS; +return -1; +#endif +} +#else +extern int pivot_root(const char * new_root, const char * put_old); +#endif + enum { LXC_TYPE_CGDIR, LXC_TYPE_CGFILE, @@ -54,8 +72,8 @@ struct file_info { int cached; }; -/* reserve buffer size, for cpuall in /proc/stat */ -#define BUF_RESERVE_SIZE 256 +/* Reserve buffer size to account for file size changes. */ +#define BUF_RESERVE_SIZE 512 /* * A table caching which pid is init for a pid namespace. @@ -90,17 +108,37 @@ static void lock_mutex(pthread_mutex_t *l) int ret; if ((ret = pthread_mutex_lock(l)) != 0) { - fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", ret, strerror(ret)); + lxcfs_error("returned:%d %s\n", ret, strerror(ret)); exit(1); } } +/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run. + * Number of hierarchies mounted. */ +static int num_hierarchies; + +/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run. + * Hierachies mounted {cpuset, blkio, ...}: + * Initialized via __constructor__ collect_and_mount_subsystems(). */ +static char **hierarchies; + +/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run. + * Open file descriptors: + * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a + * private mount namespace. + * Initialized via __constructor__ collect_and_mount_subsystems(). + * @fd_hierarchies[i] can be used to perform file operations on the cgroup + * mounts and respective files in the private namespace even when located in + * another namespace using the *at() family of functions + * {openat(), fchownat(), ...}. */ +static int *fd_hierarchies; + static void unlock_mutex(pthread_mutex_t *l) { int ret; if ((ret = pthread_mutex_unlock(l)) != 0) { - fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", ret, strerror(ret)); + lxcfs_error("returned:%d %s\n", ret, strerror(ret)); exit(1); } } @@ -124,10 +162,10 @@ static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb) snprintf(fnam, 100, "/proc/%d", e->initpid); if (stat(fnam, &initsb) < 0) return false; -#if DEBUG - fprintf(stderr, "comparing ctime %ld %ld for pid %d\n", - e->ctime, initsb.st_ctime, e->initpid); -#endif + + lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime, + initsb.st_ctime, e->initpid); + if (e->ctime != initsb.st_ctime) return false; return true; @@ -139,9 +177,8 @@ static void remove_initpid(struct pidns_init_store *e) struct pidns_init_store *tmp; int h; -#if DEBUG - fprintf(stderr, "remove_initpid: removing entry for %d\n", e->initpid); -#endif + lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid); + h = HASH(e->ino); if (pidns_hash_table[h] == e) { pidns_hash_table[h] = e->next; @@ -176,18 +213,18 @@ static void prune_initpid_store(void) now = time(NULL); if (now < last_prune + PURGE_SECS) return; -#if DEBUG - fprintf(stderr, "pruning\n"); -#endif + + lxcfs_debug("%s\n", "Pruning."); + last_prune = now; threshold = now - 2 * PURGE_SECS; for (i = 0; i < PIDNS_HASH_SIZE; i++) { for (prev = NULL, e = pidns_hash_table[i]; e; ) { if (e->lastcheck < threshold) { -#if DEBUG - fprintf(stderr, "Removing cached entry for %d\n", e->initpid); -#endif + + lxcfs_debug("Removing cached entry for %d.\n", e->initpid); + delme = e; if (prev) prev->next = e->next; @@ -211,9 +248,8 @@ static void save_initpid(struct stat *sb, pid_t pid) struct stat procsb; int h; -#if DEBUG - fprintf(stderr, "save_initpid: adding entry for %d\n", pid); -#endif + lxcfs_debug("Save_initpid: adding entry for %d.\n", pid); + snprintf(fpath, 100, "/proc/%d", pid); if (stat(fpath, &procsb) < 0) return; @@ -256,10 +292,10 @@ static struct pidns_init_store *lookup_verify_initpid(struct stat *sb) return NULL; } -static int is_dir(const char *path) +static int is_dir(const char *path, int fd) { struct stat statbuf; - int ret = stat(path, &statbuf); + int ret = fstatat(fd, path, &statbuf, fd); if (ret == 0 && S_ISDIR(statbuf.st_mode)) return 1; return 0; @@ -307,11 +343,11 @@ static void append_line(char **contents, size_t *len, char *line, ssize_t linele *len = newlen; } -static char *slurp_file(const char *from) +static char *slurp_file(const char *from, int fd) { char *line = NULL; char *contents = NULL; - FILE *f = fopen(from, "r"); + FILE *f = fdopen(fd, "r"); size_t len = 0, fulllen = 0; ssize_t linelen; @@ -329,33 +365,27 @@ static char *slurp_file(const char *from) return contents; } -static bool write_string(const char *fnam, const char *string) +static bool write_string(const char *fnam, const char *string, int fd) { FILE *f; size_t len, ret; - if (!(f = fopen(fnam, "w"))) + if (!(f = fdopen(fd, "w"))) return false; len = strlen(string); ret = fwrite(string, 1, len, f); if (ret != len) { - fprintf(stderr, "Error writing to file: %s\n", strerror(errno)); + lxcfs_error("Error writing to file: %s\n", strerror(errno)); fclose(f); return false; } if (fclose(f) < 0) { - fprintf(stderr, "Error writing to file: %s\n", strerror(errno)); + lxcfs_error("Error writing to file: %s\n", strerror(errno)); return false; } return true; } -/* - * hierarchies, i.e. 'cpu,cpuacct' - */ -char **hierarchies; -int num_hierarchies; - struct cgfs_files { char *name; uint32_t uid, gid; @@ -370,12 +400,12 @@ static bool store_hierarchy(char *stridx, char *h) n *= ALLOC_NUM; char **tmp = realloc(hierarchies, n * sizeof(char *)); if (!tmp) { - fprintf(stderr, "Out of memory\n"); + lxcfs_error("%s\n", strerror(errno)); exit(1); } hierarchies = tmp; } - + hierarchies[num_hierarchies++] = must_copy_string(h); return true; } @@ -384,10 +414,11 @@ static void print_subsystems(void) { int i; - fprintf(stderr, "hierarchies:"); + fprintf(stderr, "hierarchies:\n"); for (i = 0; i < num_hierarchies; i++) { if (hierarchies[i]) - fprintf(stderr, " %d: %s\n", i, hierarchies[i]); + fprintf(stderr, " %2d: fd: %3d: %s\n", i, + fd_hierarchies[i], hierarchies[i]); } } @@ -396,7 +427,7 @@ static bool in_comma_list(const char *needle, const char *haystack) const char *s = haystack, *e; size_t nlen = strlen(needle); - while (*s && (e = index(s, ','))) { + while (*s && (e = strchr(s, ','))) { if (nlen != e - s) { s = e + 1; continue; @@ -411,17 +442,25 @@ static bool in_comma_list(const char *needle, const char *haystack) } /* do we need to do any massaging here? I'm not sure... */ -static char *find_mounted_controller(const char *controller) +/* Return the mounted controller and store the corresponding open file descriptor + * referring to the controller mountpoint in the private lxcfs namespace in + * @cfd. + */ +static char *find_mounted_controller(const char *controller, int *cfd) { int i; for (i = 0; i < num_hierarchies; i++) { if (!hierarchies[i]) continue; - if (strcmp(hierarchies[i], controller) == 0) + if (strcmp(hierarchies[i], controller) == 0) { + *cfd = fd_hierarchies[i]; return hierarchies[i]; - if (in_comma_list(controller, hierarchies[i])) + } + if (in_comma_list(controller, hierarchies[i])) { + *cfd = fd_hierarchies[i]; return hierarchies[i]; + } } return NULL; @@ -430,176 +469,209 @@ static char *find_mounted_controller(const char *controller) bool cgfs_set_value(const char *controller, const char *cgroup, const char *file, const char *value) { + int ret, fd, cfd; size_t len; - char *fnam, *tmpc = find_mounted_controller(controller); + char *fnam, *tmpc; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return false; - /* basedir / tmpc / cgroup / file \0 */ - len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + strlen(file) + 4; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / + file + \0 + */ + len = strlen(cgroup) + strlen(file) + 3; fnam = alloca(len); - snprintf(fnam, len, "%s/%s/%s/%s", basedir, tmpc, cgroup, file); - - return write_string(fnam, value); + ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file); + if (ret < 0 || (size_t)ret >= len) + return false; + + fd = openat(cfd, fnam, O_WRONLY); + if (fd < 0) + return false; + + return write_string(fnam, value, fd); } // Chown all the files in the cgroup directory. We do this when we create // a cgroup on behalf of a user. -static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid) +static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd) { - struct dirent dirent, *direntp; + struct dirent *direntp; char path[MAXPATHLEN]; size_t len; DIR *d; - int ret; + int fd1, ret; len = strlen(dirname); if (len >= MAXPATHLEN) { - fprintf(stderr, "chown_all_cgroup_files: pathname too long: %s\n", dirname); + lxcfs_error("Pathname too long: %s\n", dirname); return; } - d = opendir(dirname); + fd1 = openat(fd, dirname, O_DIRECTORY); + if (fd1 < 0) + return; + + d = fdopendir(fd1); if (!d) { - fprintf(stderr, "chown_all_cgroup_files: failed to open %s\n", dirname); + lxcfs_error("Failed to open %s\n", dirname); return; } - while (readdir_r(d, &dirent, &direntp) == 0 && direntp) { + while ((direntp = readdir(d))) { if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, "..")) continue; ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); if (ret < 0 || ret >= MAXPATHLEN) { - fprintf(stderr, "chown_all_cgroup_files: pathname too long under %s\n", dirname); + lxcfs_error("Pathname too long under %s\n", dirname); continue; } - if (chown(path, uid, gid) < 0) - fprintf(stderr, "Failed to chown file %s to %u:%u", path, uid, gid); + if (fchownat(fd, path, uid, gid, 0) < 0) + lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid); } closedir(d); } int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid) { + int cfd; size_t len; - char *dirnam, *tmpc = find_mounted_controller(controller); + char *dirnam, *tmpc; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return -EINVAL; - /* basedir / tmpc / cg \0 */ - len = strlen(basedir) + strlen(tmpc) + strlen(cg) + 3; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cg + \0 + */ + len = strlen(cg) + 2; dirnam = alloca(len); - snprintf(dirnam, len, "%s/%s/%s", basedir,tmpc, cg); + snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg); - if (mkdir(dirnam, 0755) < 0) + if (mkdirat(cfd, dirnam, 0755) < 0) return -errno; if (uid == 0 && gid == 0) return 0; - if (chown(dirnam, uid, gid) < 0) + if (fchownat(cfd, dirnam, uid, gid, 0) < 0) return -errno; - chown_all_cgroup_files(dirnam, uid, gid); + chown_all_cgroup_files(dirnam, uid, gid, cfd); return 0; } -static bool recursive_rmdir(const char *dirname) +static bool recursive_rmdir(const char *dirname, int fd, const int cfd) { - struct dirent dirent, *direntp; + struct dirent *direntp; DIR *dir; bool ret = false; char pathname[MAXPATHLEN]; + int dupfd; - dir = opendir(dirname); + dupfd = dup(fd); // fdopendir() does bad things once it uses an fd. + if (dupfd < 0) + return false; + + dir = fdopendir(dupfd); if (!dir) { -#if DEBUG - fprintf(stderr, "%s: failed to open %s: %s\n", __func__, dirname, strerror(errno)); -#endif + lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno)); + close(dupfd); return false; } - while (!readdir_r(dir, &dirent, &direntp)) { + while ((direntp = readdir(dir))) { struct stat mystat; int rc; - if (!direntp) - break; - if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, "..")) continue; rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); if (rc < 0 || rc >= MAXPATHLEN) { - fprintf(stderr, "pathname too long\n"); + lxcfs_error("%s\n", "Pathname too long."); continue; } - ret = lstat(pathname, &mystat); - if (ret) { -#if DEBUG - fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno)); -#endif + rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW); + if (rc) { + lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno)); continue; } - if (S_ISDIR(mystat.st_mode)) { - if (!recursive_rmdir(pathname)) { -#if DEBUG - fprintf(stderr, "Error removing %s\n", pathname); -#endif - } - } + if (S_ISDIR(mystat.st_mode)) + if (!recursive_rmdir(pathname, fd, cfd)) + lxcfs_debug("Error removing %s.\n", pathname); } ret = true; if (closedir(dir) < 0) { - fprintf(stderr, "%s: failed to close directory %s: %s\n", __func__, dirname, strerror(errno)); + lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno)); ret = false; } - if (rmdir(dirname) < 0) { -#if DEBUG - fprintf(stderr, "%s: failed to delete %s: %s\n", __func__, dirname, strerror(errno)); -#endif + if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) { + lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno)); ret = false; } + close(dupfd); + return ret; } bool cgfs_remove(const char *controller, const char *cg) { + int fd, cfd; size_t len; - char *dirnam, *tmpc = find_mounted_controller(controller); + char *dirnam, *tmpc; + bool bret; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return false; - /* basedir / tmpc / cg \0 */ - len = strlen(basedir) + strlen(tmpc) + strlen(cg) + 3; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cg + \0 + */ + len = strlen(cg) + 2; dirnam = alloca(len); - snprintf(dirnam, len, "%s/%s/%s", basedir,tmpc, cg); - return recursive_rmdir(dirnam); + snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg); + + fd = openat(cfd, dirnam, O_DIRECTORY); + if (fd < 0) + return false; + + bret = recursive_rmdir(dirnam, fd, cfd); + close(fd); + return bret; } bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode) { + int cfd; size_t len; - char *pathname, *tmpc = find_mounted_controller(controller); + char *pathname, *tmpc; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return false; - /* basedir / tmpc / file \0 */ - len = strlen(basedir) + strlen(tmpc) + strlen(file) + 3; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /file + \0 + */ + len = strlen(file) + 2; pathname = alloca(len); - snprintf(pathname, len, "%s/%s/%s", basedir, tmpc, file); - if (chmod(pathname, mode) < 0) + snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file); + if (fchmodat(cfd, pathname, mode, 0) < 0) return false; return true; } -static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid) +static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd) { size_t len; char *fname; @@ -607,94 +679,114 @@ static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid) len = strlen(dirname) + strlen("/cgroup.procs") + 1; fname = alloca(len); snprintf(fname, len, "%s/tasks", dirname); - if (chown(fname, uid, gid) != 0) + if (fchownat(fd, fname, uid, gid, 0) != 0) return -errno; snprintf(fname, len, "%s/cgroup.procs", dirname); - if (chown(fname, uid, gid) != 0) + if (fchownat(fd, fname, uid, gid, 0) != 0) return -errno; return 0; } int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid) { + int cfd; size_t len; - char *pathname, *tmpc = find_mounted_controller(controller); + char *pathname, *tmpc; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return -EINVAL; - /* basedir / tmpc / file \0 */ - len = strlen(basedir) + strlen(tmpc) + strlen(file) + 3; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /file + \0 + */ + len = strlen(file) + 2; pathname = alloca(len); - snprintf(pathname, len, "%s/%s/%s", basedir, tmpc, file); - if (chown(pathname, uid, gid) < 0) + snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file); + if (fchownat(cfd, pathname, uid, gid, 0) < 0) return -errno; - if (is_dir(pathname)) + if (is_dir(pathname, cfd)) // like cgmanager did, we want to chown the tasks file as well - return chown_tasks_files(pathname, uid, gid); + return chown_tasks_files(pathname, uid, gid, cfd); return 0; } FILE *open_pids_file(const char *controller, const char *cgroup) { + int fd, cfd; size_t len; - char *pathname, *tmpc = find_mounted_controller(controller); + char *pathname, *tmpc; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return NULL; - /* basedir / tmpc / cgroup / "cgroup.procs" \0 */ - len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 4 + strlen("cgroup.procs"); + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / "cgroup.procs" + \0 + */ + len = strlen(cgroup) + strlen("cgroup.procs") + 3; pathname = alloca(len); - snprintf(pathname, len, "%s/%s/%s/cgroup.procs", basedir, tmpc, cgroup); - return fopen(pathname, "w"); + snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup); + + fd = openat(cfd, pathname, O_WRONLY); + if (fd < 0) + return NULL; + + return fdopen(fd, "w"); } static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories, void ***list, size_t typesize, void* (*iterator)(const char*, const char*, const char*)) { + int cfd, fd, ret; size_t len; - char *dirname, *tmpc = find_mounted_controller(controller); + char *cg, *tmpc; char pathname[MAXPATHLEN]; size_t sz = 0, asz = 0; - struct dirent dirent, *direntp; + struct dirent *dirent; DIR *dir; - int ret; + tmpc = find_mounted_controller(controller, &cfd); *list = NULL; if (!tmpc) return false; - /* basedir / tmpc / cgroup \0 */ - len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 3; - dirname = alloca(len); - snprintf(dirname, len, "%s/%s/%s", basedir, tmpc, cgroup); + /* Make sure we pass a relative path to *at() family of functions. */ + len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */; + cg = alloca(len); + ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup); + if (ret < 0 || (size_t)ret >= len) { + lxcfs_error("Pathname too long under %s\n", cgroup); + return false; + } + + fd = openat(cfd, cg, O_DIRECTORY); + if (fd < 0) + return false; - dir = opendir(dirname); + dir = fdopendir(fd); if (!dir) return false; - while (!readdir_r(dir, &dirent, &direntp)) { + while ((dirent = readdir(dir))) { struct stat mystat; - int rc; - if (!direntp) - break; - - if (!strcmp(direntp->d_name, ".") || - !strcmp(direntp->d_name, "..")) + if (!strcmp(dirent->d_name, ".") || + !strcmp(dirent->d_name, "..")) continue; - rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); - if (rc < 0 || rc >= MAXPATHLEN) { - fprintf(stderr, "%s: pathname too long under %s\n", __func__, dirname); + ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name); + if (ret < 0 || ret >= MAXPATHLEN) { + lxcfs_error("Pathname too long under %s\n", cg); continue; } - ret = lstat(pathname, &mystat); + ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW); if (ret) { - fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno)); + lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno)); continue; } if ((!directories && !S_ISREG(mystat.st_mode)) || @@ -709,12 +801,12 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool } while (!tmp); *list = tmp; } - (*list)[sz] = (*iterator)(controller, cgroup, direntp->d_name); + (*list)[sz] = (*iterator)(controller, cg, dirent->d_name); (*list)[sz+1] = NULL; sz++; } if (closedir(dir) < 0) { - fprintf(stderr, "%s: failed closedir for %s: %s\n", __func__, dirname, strerror(errno)); + lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno)); return false; } return true; @@ -756,46 +848,60 @@ void free_keys(struct cgfs_files **keys) bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value) { + int ret, fd, cfd; size_t len; - char *fnam, *tmpc = find_mounted_controller(controller); + char *fnam, *tmpc; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return false; - /* basedir / tmpc / cgroup / file \0 */ - len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + strlen(file) + 4; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / + file + \0 + */ + len = strlen(cgroup) + strlen(file) + 3; fnam = alloca(len); - snprintf(fnam, len, "%s/%s/%s/%s", basedir, tmpc, cgroup, file); + ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file); + if (ret < 0 || (size_t)ret >= len) + return false; + + fd = openat(cfd, fnam, O_RDONLY); + if (fd < 0) + return false; - *value = slurp_file(fnam); + *value = slurp_file(fnam, fd); return *value != NULL; } struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file) { + int ret, cfd; size_t len; - char *fnam, *tmpc = find_mounted_controller(controller); + char *fnam, *tmpc; struct stat sb; struct cgfs_files *newkey; - int ret; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return false; if (file && *file == '/') file++; - if (file && index(file, '/')) + if (file && strchr(file, '/')) return NULL; - /* basedir / tmpc / cgroup / file \0 */ - len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 3; + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / + file + \0 + */ + len = strlen(cgroup) + 3; if (file) len += strlen(file) + 1; fnam = alloca(len); - snprintf(fnam, len, "%s/%s/%s%s%s", basedir, tmpc, cgroup, - file ? "/" : "", file ? file : ""); + snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup, + file ? "/" : "", file ? file : ""); - ret = stat(fnam, &sb); + ret = fstatat(cfd, fnam, &sb, 0); if (ret < 0) return NULL; @@ -804,8 +910,8 @@ struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, cons } while (!newkey); if (file) newkey->name = must_copy_string(file); - else if (rindex(cgroup, '/')) - newkey->name = must_copy_string(rindex(cgroup, '/')); + else if (strrchr(cgroup, '/')) + newkey->name = must_copy_string(strrchr(cgroup, '/')); else newkey->name = must_copy_string(cgroup); newkey->uid = sb.st_uid; @@ -819,8 +925,8 @@ static void *make_key_list_entry(const char *controller, const char *cgroup, con { struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry); if (!entry) { - fprintf(stderr, "%s: Error getting files under %s:%s\n", - __func__, controller, cgroup); + lxcfs_error("Error getting files under %s:%s\n", controller, + cgroup); } return entry; } @@ -831,21 +937,30 @@ bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_file } bool is_child_cgroup(const char *controller, const char *cgroup, const char *f) -{ size_t len; - char *fnam, *tmpc = find_mounted_controller(controller); +{ + int cfd; + size_t len; + char *fnam, *tmpc; int ret; struct stat sb; + tmpc = find_mounted_controller(controller, &cfd); if (!tmpc) return false; - /* basedir / tmpc / cgroup / f \0 */ - len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + strlen(f) + 4; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / + f + \0 + */ + len = strlen(cgroup) + strlen(f) + 3; fnam = alloca(len); - snprintf(fnam, len, "%s/%s/%s/%s", basedir, tmpc, cgroup, f); + ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f); + if (ret < 0 || (size_t)ret >= len) + return false; - ret = stat(fnam, &sb); + ret = fstatat(cfd, fnam, &sb, 0); if (ret < 0 || !S_ISDIR(sb.st_mode)) return false; + return true; } @@ -1048,7 +1163,7 @@ convert_id_to_ns(FILE *idfile, unsigned int in_id) * uids wrapped around - unexpected as this is a procfile, * so just bail. */ - fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n", + lxcfs_error("pid wrapparound at entry %u %u %u in %s\n", nsuid, hostuid, count, line); return -1; } @@ -1154,11 +1269,11 @@ static char *get_next_cgroup_dir(const char *taskcg, const char *querycg) char *start, *end; if (strlen(taskcg) <= strlen(querycg)) { - fprintf(stderr, "%s: I was fed bad input\n", __func__); + lxcfs_error("%s\n", "I was fed bad input."); return NULL; } - if (strcmp(querycg, "/") == 0) + if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0)) start = strdup(taskcg + 1); else start = strdup(taskcg + strlen(querycg) + 1); @@ -1179,13 +1294,14 @@ static void stripnewline(char *x) static char *get_pid_cgroup(pid_t pid, const char *contrl) { + int cfd; char fnam[PROCLEN]; FILE *f; char *answer = NULL; char *line = NULL; size_t len = 0; int ret; - const char *h = find_mounted_controller(contrl); + const char *h = find_mounted_controller(contrl, &cfd); if (!h) return NULL; @@ -1298,10 +1414,18 @@ static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, prune_init_slice(c2); /* - * callers pass in '/' for root cgroup, otherwise they pass - * in a cgroup without leading '/' + * callers pass in '/' or './' (openat()) for root cgroup, otherwise + * they pass in a cgroup without leading '/' + * + * The original line here was: + * linecmp = *cg == '/' ? c2 : c2+1; + * TODO: I'm not sure why you'd want to increment when *cg != '/'? + * Serge, do you know? */ - linecmp = *cg == '/' ? c2 : c2+1; + if (*cg == '/' || !strncmp(cg, "./", 2)) + linecmp = c2; + else + linecmp = c2 + 1; if (strncmp(linecmp, cg, strlen(linecmp)) != 0) { if (nextcg) { *nextcg = get_next_cgroup_dir(linecmp, cg); @@ -1324,7 +1448,7 @@ static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg) char *c2, *task_cg; size_t target_len, task_len; - if (strcmp(cg, "/") == 0) + if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0) return true; c2 = get_pid_cgroup(pid, contrl); @@ -1375,23 +1499,30 @@ static char *pick_controller_from_path(struct fuse_context *fc, const char *path const char *p1; char *contr, *slash; - if (strlen(path) < 9) + if (strlen(path) < 9) { + errno = EACCES; return NULL; - if (*(path+7) != '/') + } + if (*(path + 7) != '/') { + errno = EINVAL; return NULL; - p1 = path+8; + } + p1 = path + 8; contr = strdupa(p1); - if (!contr) + if (!contr) { + errno = ENOMEM; return NULL; + } slash = strstr(contr, "/"); if (slash) *slash = '\0'; int i; - for (i = 0; i < num_hierarchies; i++) { + for (i = 0; i < num_hierarchies; i++) { if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0) return hierarchies[i]; } + errno = ENOENT; return NULL; } @@ -1403,12 +1534,17 @@ static const char *find_cgroup_in_path(const char *path) { const char *p1; - if (strlen(path) < 9) + if (strlen(path) < 9) { + errno = EACCES; return NULL; - p1 = strstr(path+8, "/"); - if (!p1) + } + p1 = strstr(path + 8, "/"); + if (!p1) { + errno = EINVAL; return NULL; - return p1+1; + } + errno = 0; + return p1 + 1; } /* @@ -1467,7 +1603,7 @@ int cg_getattr(const char *path, struct stat *sb) controller = pick_controller_from_path(fc, path); if (!controller) - return -EIO; + return -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) { /* this is just /cgroup/controller, return it as a dir */ @@ -1537,11 +1673,6 @@ int cg_getattr(const char *path, struct stat *sb) ret = -ENOENT; goto out; } - if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) { - ret = -EACCES; - goto out; - } - ret = 0; } @@ -1567,7 +1698,7 @@ int cg_opendir(const char *path, struct fuse_file_info *fi) // return list of keys for the controller, and list of child cgroups controller = pick_controller_from_path(fc, path); if (!controller) - return -EIO; + return -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) { @@ -1611,8 +1742,11 @@ int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset struct fuse_context *fc = fuse_get_context(); char **clist = NULL; + if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0) + return -EIO; + if (d->type != LXC_TYPE_CGDIR) { - fprintf(stderr, "Internal error: file cache info used in readdir\n"); + lxcfs_error("%s\n", "Internal error: file cache info used in readdir."); return -EIO; } if (!d->cgroup && !d->controller) { @@ -1682,22 +1816,29 @@ out: return ret; } -static void do_release_file_info(struct file_info *f) +static void do_release_file_info(struct fuse_file_info *fi) { + struct file_info *f = (struct file_info *)fi->fh; + if (!f) return; + + fi->fh = 0; + free(f->controller); + f->controller = NULL; free(f->cgroup); + f->cgroup = NULL; free(f->file); + f->file = NULL; free(f->buf); + f->buf = NULL; free(f); } int cg_releasedir(const char *path, struct fuse_file_info *fi) { - struct file_info *d = (struct file_info *)fi->fh; - - do_release_file_info(d); + do_release_file_info(fi); return 0; } @@ -1715,10 +1856,10 @@ int cg_open(const char *path, struct fuse_file_info *fi) controller = pick_controller_from_path(fc, path); if (!controller) - return -EIO; + return -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) - return -EINVAL; + return -errno; get_cgdir_and_path(cgroup, &cgdir, &last); if (!last) { @@ -1744,7 +1885,6 @@ int cg_open(const char *path, struct fuse_file_info *fi) goto out; } if (!fc_may_access(fc, controller, path1, path2, fi->flags)) { - // should never get here ret = -EACCES; goto out; } @@ -1770,11 +1910,73 @@ out: return ret; } -int cg_release(const char *path, struct fuse_file_info *fi) +int cg_access(const char *path, int mode) { - struct file_info *f = (struct file_info *)fi->fh; + int ret; + const char *cgroup; + char *path1, *path2, *controller; + char *last = NULL, *cgdir = NULL; + struct cgfs_files *k = NULL; + struct fuse_context *fc = fuse_get_context(); + + if (strcmp(path, "/cgroup") == 0) + return 0; + + if (!fc) + return -EIO; + + controller = pick_controller_from_path(fc, path); + if (!controller) + return -errno; + cgroup = find_cgroup_in_path(path); + if (!cgroup) { + // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not + if ((mode & W_OK) == 0) + return 0; + return -EACCES; + } + + get_cgdir_and_path(cgroup, &cgdir, &last); + if (!last) { + path1 = "/"; + path2 = cgdir; + } else { + path1 = cgdir; + path2 = last; + } + + k = cgfs_get_key(controller, path1, path2); + if (!k) { + if ((mode & W_OK) == 0) + ret = 0; + else + ret = -EACCES; + goto out; + } + free_key(k); + + pid_t initpid = lookup_initpid_in_store(fc->pid); + if (initpid <= 0) + initpid = fc->pid; + if (!caller_may_see_dir(initpid, controller, path1)) { + ret = -ENOENT; + goto out; + } + if (!fc_may_access(fc, controller, path1, path2, mode)) { + ret = -EACCES; + goto out; + } + + ret = 0; + +out: + free(cgdir); + return ret; +} - do_release_file_info(f); +int cg_release(const char *path, struct fuse_file_info *fi) +{ + do_release_file_info(fi); return 0; } @@ -1789,14 +1991,14 @@ static bool wait_for_sock(int sock, int timeout) return false; if ((epfd = epoll_create(1)) < 0) { - fprintf(stderr, "Failed to create epoll socket: %m\n"); + lxcfs_error("%s\n", "Failed to create epoll socket: %m."); return false; } ev.events = POLLIN_SET; ev.data.fd = sock; if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) { - fprintf(stderr, "Failed adding socket to epoll: %m\n"); + lxcfs_error("%s\n", "Failed adding socket to epoll: %m."); close(epfd); return false; } @@ -1844,8 +2046,7 @@ static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst) if (pingfirst) { if (msgrecv(sock, buf, 1) != 1) { - fprintf(stderr, "%s: Error getting reply from server over socketpair\n", - __func__); + lxcfs_error("%s\n", "Error getting reply from server over socketpair."); return SEND_CREDS_FAIL; } } @@ -1869,8 +2070,7 @@ static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst) msg.msg_iovlen = 1; if (sendmsg(sock, &msg, 0) < 0) { - fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__, - strerror(errno)); + lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno)); if (errno == 3) return SEND_CREDS_NOTSK; return SEND_CREDS_FAIL; @@ -1896,12 +2096,12 @@ static bool recv_creds(int sock, struct ucred *cred, char *v) cred->gid = -1; if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) { - fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno)); + lxcfs_error("Failed to set passcred: %s\n", strerror(errno)); return false; } buf[0] = '1'; if (write(sock, buf, 1) != 1) { - fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno)); + lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno)); return false; } @@ -1916,14 +2116,12 @@ static bool recv_creds(int sock, struct ucred *cred, char *v) msg.msg_iovlen = 1; if (!wait_for_sock(sock, 2)) { - fprintf(stderr, "Timed out waiting for scm_cred: %s\n", - strerror(errno)); + lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno)); return false; } ret = recvmsg(sock, &msg, MSG_DONTWAIT); if (ret < 0) { - fprintf(stderr, "Failed to receive scm_cred: %s\n", - strerror(errno)); + lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno)); return false; } @@ -1956,10 +2154,8 @@ static int pid_ns_clone_wrapper(void *arg) { char b = '1'; close(args->cpipe[0]); - if (write(args->cpipe[1], &b, sizeof(char)) < 0) { - fprintf(stderr, "%s (child): error on write: %s\n", - __func__, strerror(errno)); - } + if (write(args->cpipe[1], &b, sizeof(char)) < 0) + lxcfs_error("(child): error on write: %s.\n", strerror(errno)); close(args->cpipe[1]); return args->wrapped(args->sock, args->tpid); } @@ -2093,13 +2289,11 @@ bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *fi // read converted results if (!wait_for_sock(sock[0], 2)) { - fprintf(stderr, "%s: timed out waiting for pid from child: %s\n", - __func__, strerror(errno)); + lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno)); goto out; } if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { - fprintf(stderr, "%s: error reading pid from child: %s\n", - __func__, strerror(errno)); + lxcfs_error("Error reading pid from child: %s.\n", strerror(errno)); goto out; } must_strcat_pid(d, &sz, &asz, qpid); @@ -2114,8 +2308,7 @@ next: v = '1'; if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) { // failed to ask child to exit - fprintf(stderr, "%s: failed to ask child to exit: %s\n", - __func__, strerror(errno)); + lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno)); goto out; } @@ -2143,7 +2336,7 @@ int cg_read(const char *path, char *buf, size_t size, off_t offset, bool r; if (f->type != LXC_TYPE_CGFILE) { - fprintf(stderr, "Internal error: directory cache info used in cg_read\n"); + lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read."); return -EIO; } @@ -2162,7 +2355,7 @@ int cg_read(const char *path, char *buf, size_t size, off_t offset, free_key(k); - if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here + if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { ret = -EACCES; goto out; } @@ -2210,12 +2403,11 @@ static int pid_from_ns(int sock, pid_t tpid) cred.gid = 0; while (1) { if (!wait_for_sock(sock, 2)) { - fprintf(stderr, "%s: timeout reading from parent\n", __func__); + lxcfs_error("%s\n", "Timeout reading from parent."); return 1; } if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) { - fprintf(stderr, "%s: bad read from parent: %s\n", - __func__, strerror(errno)); + lxcfs_error("Bad read from parent: %s.\n", strerror(errno)); return 1; } if (vpid == -1) // done @@ -2316,20 +2508,20 @@ void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid) *gid = -1; sprintf(line, "/proc/%d/status", pid); if ((f = fopen(line, "r")) == NULL) { - fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno)); + lxcfs_error("Error opening %s: %s\n", line, strerror(errno)); return; } while (fgets(line, 400, f)) { if (strncmp(line, "Uid:", 4) == 0) { if (sscanf(line+4, "%u", &u) != 1) { - fprintf(stderr, "bad uid line for pid %u\n", pid); + lxcfs_error("bad uid line for pid %u\n", pid); fclose(f); return; } *uid = u; } else if (strncmp(line, "Gid:", 4) == 0) { if (sscanf(line+4, "%u", &g) != 1) { - fprintf(stderr, "bad gid line for pid %u\n", pid); + lxcfs_error("bad gid line for pid %u\n", pid); fclose(f); return; } @@ -2401,8 +2593,7 @@ static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char char v; if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { - fprintf(stderr, "%s: error writing pid to child: %s\n", - __func__, strerror(errno)); + lxcfs_error("Error writing pid to child: %s.\n", strerror(errno)); goto out; } @@ -2426,7 +2617,7 @@ static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char /* All good, write the value */ qpid = -1; if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid)) - fprintf(stderr, "Warning: failed to ask child to exit\n"); + lxcfs_error("%s\n", "Warning: failed to ask child to exit."); if (!fail) answer = true; @@ -2455,7 +2646,7 @@ int cg_write(const char *path, const char *buf, size_t size, off_t offset, bool r; if (f->type != LXC_TYPE_CGFILE) { - fprintf(stderr, "Internal error: directory cache info used in cg_write\n"); + lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write."); return -EIO; } @@ -2508,15 +2699,16 @@ int cg_chown(const char *path, uid_t uid, gid_t gid) return -EIO; if (strcmp(path, "/cgroup") == 0) - return -EINVAL; + return -EPERM; controller = pick_controller_from_path(fc, path); if (!controller) - return -EINVAL; + return errno == ENOENT ? -EPERM : -errno; + cgroup = find_cgroup_in_path(path); if (!cgroup) /* this is just /cgroup/controller */ - return -EINVAL; + return -EPERM; get_cgdir_and_path(cgroup, &cgdir, &last); @@ -2573,15 +2765,16 @@ int cg_chmod(const char *path, mode_t mode) return -EIO; if (strcmp(path, "/cgroup") == 0) - return -EINVAL; + return -EPERM; controller = pick_controller_from_path(fc, path); if (!controller) - return -EINVAL; + return errno == ENOENT ? -EPERM : -errno; + cgroup = find_cgroup_in_path(path); if (!cgroup) /* this is just /cgroup/controller */ - return -EINVAL; + return -EPERM; get_cgdir_and_path(cgroup, &cgdir, &last); @@ -2639,14 +2832,13 @@ int cg_mkdir(const char *path, mode_t mode) if (!fc) return -EIO; - controller = pick_controller_from_path(fc, path); if (!controller) - return -EINVAL; + return errno == ENOENT ? -EPERM : -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) - return -EINVAL; + return -errno; get_cgdir_and_path(cgroup, &cgdir, &last); if (!last) @@ -2663,7 +2855,7 @@ int cg_mkdir(const char *path, mode_t mode) else if (last && strcmp(next, last) == 0) ret = -EEXIST; else - ret = -ENOENT; + ret = -EPERM; goto out; } @@ -2695,16 +2887,20 @@ int cg_rmdir(const char *path) return -EIO; controller = pick_controller_from_path(fc, path); - if (!controller) - return -EINVAL; + if (!controller) /* Someone's trying to delete "/cgroup". */ + return -EPERM; cgroup = find_cgroup_in_path(path); - if (!cgroup) - return -EINVAL; + if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */ + return -EPERM; get_cgdir_and_path(cgroup, &cgdir, &last); if (!last) { - ret = -EINVAL; + /* Someone's trying to delete a cgroup on the same level as the + * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or + * rmdir "/cgroup/blkio/init.slice". + */ + ret = -EPERM; goto out; } @@ -2712,7 +2908,7 @@ int cg_rmdir(const char *path) if (initpid <= 0) initpid = fc->pid; if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) { - if (!last || strcmp(next, last) == 0) + if (!last || (next && (strcmp(next, last) == 0))) ret = -EBUSY; else ret = -ENOENT; @@ -2748,16 +2944,32 @@ static bool startswith(const char *line, const char *pref) return false; } -static void get_mem_cached(char *memstat, unsigned long *v) +static void parse_memstat(char *memstat, unsigned long *cached, + unsigned long *active_anon, unsigned long *inactive_anon, + unsigned long *active_file, unsigned long *inactive_file, + unsigned long *unevictable) { char *eol; - *v = 0; while (*memstat) { - if (startswith(memstat, "total_cache")) { - sscanf(memstat + 11, "%lu", v); - *v /= 1024; - return; + if (startswith(memstat, "cache")) { + sscanf(memstat + 5, "%lu", cached); + *cached /= 1024; + } else if (startswith(memstat, "active_anon")) { + sscanf(memstat + 11, "%lu", active_anon); + *active_anon /= 1024; + } else if (startswith(memstat, "inactive_anon")) { + sscanf(memstat + 13, "%lu", inactive_anon); + *inactive_anon /= 1024; + } else if (startswith(memstat, "active_file")) { + sscanf(memstat + 11, "%lu", active_file); + *active_file /= 1024; + } else if (startswith(memstat, "inactive_file")) { + sscanf(memstat + 13, "%lu", inactive_file); + *inactive_file /= 1024; + } else if (startswith(memstat, "unevictable")) { + sscanf(memstat + 11, "%lu", unevictable); + *unevictable /= 1024; } eol = strchr(memstat, '\n'); if (!eol) @@ -2801,14 +3013,14 @@ static int read_file(const char *path, char *buf, size_t size, return 0; while (getline(&line, &linelen, f) != -1) { - size_t l = snprintf(cache, cache_size, "%s", line); + ssize_t l = snprintf(cache, cache_size, "%s", line); if (l < 0) { perror("Error writing to cache"); rv = 0; goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -2818,7 +3030,8 @@ static int read_file(const char *path, char *buf, size_t size, } d->size = total_len; - if (total_len > size ) total_len = size; + if (total_len > size) + total_len = size; /* read from off 0 */ memcpy(buf, d->buf, total_len); @@ -2833,12 +3046,12 @@ static int read_file(const char *path, char *buf, size_t size, * FUSE ops for /proc */ -static unsigned long get_memlimit(const char *cgroup) +static unsigned long get_memlimit(const char *cgroup, const char *file) { char *memlimit_str = NULL; unsigned long memlimit = -1; - if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &memlimit_str)) + if (cgfs_get_value("memory", cgroup, file, &memlimit_str)) memlimit = strtoul(memlimit_str, NULL, 10); free(memlimit_str); @@ -2846,16 +3059,16 @@ static unsigned long get_memlimit(const char *cgroup) return memlimit; } -static unsigned long get_min_memlimit(const char *cgroup) +static unsigned long get_min_memlimit(const char *cgroup, const char *file) { char *copy = strdupa(cgroup); unsigned long memlimit = 0, retlimit; - retlimit = get_memlimit(copy); + retlimit = get_memlimit(copy, file); while (strcmp(copy, "/") != 0) { copy = dirname(copy); - memlimit = get_memlimit(copy); + memlimit = get_memlimit(copy, file); if (memlimit != -1 && memlimit < retlimit) retlimit = memlimit; }; @@ -2870,10 +3083,11 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, struct file_info *d = (struct file_info *)fi->fh; char *cg; char *memusage_str = NULL, *memstat_str = NULL, - *memswlimit_str = NULL, *memswusage_str = NULL, - *memswlimit_default_str = NULL, *memswusage_default_str = NULL; + *memswlimit_str = NULL, *memswusage_str = NULL; unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0, - cached = 0, hosttotal = 0; + cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0, + active_file = 0, inactive_file = 0, unevictable = 0, + hostswtotal = 0; char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; char *cache = d->buf; @@ -2899,7 +3113,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, return read_file("/proc/meminfo", buf, size, d); prune_init_slice(cg); - memlimit = get_min_memlimit(cg); + memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str)) goto err; if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str)) @@ -2910,20 +3124,9 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) && cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str)) { - /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */ - if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str)) - goto err; - if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str)) - goto err; - - memswlimit = strtoul(memswlimit_str, NULL, 10); + memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes"); memswusage = strtoul(memswusage_str, NULL, 10); - if (!strcmp(memswlimit_str, memswlimit_default_str)) - memswlimit = 0; - if (!strcmp(memswusage_str, memswusage_default_str)) - memswusage = 0; - memswlimit = memswlimit / 1024; memswusage = memswusage / 1024; } @@ -2932,19 +3135,21 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, memlimit /= 1024; memusage /= 1024; - get_mem_cached(memstat_str, &cached); + parse_memstat(memstat_str, &cached, &active_anon, + &inactive_anon, &active_file, &inactive_file, + &unevictable); f = fopen("/proc/meminfo", "r"); if (!f) goto err; while (getline(&line, &linelen, f) != -1) { - size_t l; + ssize_t l; char *printme, lbuf[100]; memset(lbuf, 0, 100); if (startswith(line, "MemTotal:")) { - sscanf(line+14, "%lu", &hosttotal); + sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal); if (hosttotal < memlimit) memlimit = hosttotal; snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit); @@ -2956,11 +3161,16 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage); printme = lbuf; } else if (startswith(line, "SwapTotal:") && memswlimit > 0) { + sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal); + if (hostswtotal < memswlimit - memlimit) + memswlimit = hostswtotal + memlimit; snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit); printme = lbuf; } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) { - snprintf(lbuf, 100, "SwapFree: %8lu kB\n", - (memswlimit - memlimit) - (memswusage - memusage)); + unsigned long swaptotal = memswlimit - memlimit, + swapusage = memswusage - memusage, + swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0; + snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree); printme = lbuf; } else if (startswith(line, "Slab:")) { snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL); @@ -2974,6 +3184,35 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, } else if (startswith(line, "SwapCached:")) { snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL); printme = lbuf; + } else if (startswith(line, "Active:")) { + snprintf(lbuf, 100, "Active: %8lu kB\n", + active_anon + active_file); + printme = lbuf; + } else if (startswith(line, "Inactive:")) { + snprintf(lbuf, 100, "Inactive: %8lu kB\n", + inactive_anon + inactive_file); + printme = lbuf; + } else if (startswith(line, "Active(anon)")) { + snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon); + printme = lbuf; + } else if (startswith(line, "Inactive(anon)")) { + snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon); + printme = lbuf; + } else if (startswith(line, "Active(file)")) { + snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file); + printme = lbuf; + } else if (startswith(line, "Inactive(file)")) { + snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file); + printme = lbuf; + } else if (startswith(line, "Unevictable")) { + snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable); + printme = lbuf; + } else if (startswith(line, "SReclaimable")) { + snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL); + printme = lbuf; + } else if (startswith(line, "SUnreclaim")) { + snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL); + printme = lbuf; } else printme = line; @@ -2985,7 +3224,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3010,8 +3249,6 @@ err: free(memswlimit_str); free(memswusage_str); free(memstat_str); - free(memswlimit_default_str); - free(memswusage_default_str); return rv; } @@ -3060,8 +3297,8 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, char *cpuset = NULL; char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; - bool am_printing = false; - int curcpu = -1; + bool am_printing = false, firstline = true, is_s390x = false; + int curcpu = -1, cpu; char *cache = d->buf; size_t cache_size = d->buflen; FILE *f = NULL; @@ -3094,7 +3331,17 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, goto err; while (getline(&line, &linelen, f) != -1) { - size_t l; + ssize_t l; + if (firstline) { + firstline = false; + if (strstr(line, "IBM/S390") != NULL) { + is_s390x = true; + am_printing = true; + continue; + } + } + if (strncmp(line, "# processors:", 12) == 0) + continue; if (is_processor_line(line)) { am_printing = cpuline_in_cpuset(line, cpuset); if (am_printing) { @@ -3106,7 +3353,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3115,6 +3362,31 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, total_len += l; } continue; + } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) { + char *p; + if (!cpu_in_cpuset(cpu, cpuset)) + continue; + curcpu ++; + p = strchr(line, ':'); + if (!p || !*p) + goto err; + p++; + l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p); + if (l < 0) { + perror("Error writing to cache"); + rv = 0; + goto err; + } + if (l >= cache_size) { + lxcfs_error("%s\n", "Internal error: truncated write to cache."); + rv = 0; + goto err; + } + cache += l; + cache_size -= l; + total_len += l; + continue; + } if (am_printing) { l = snprintf(cache, cache_size, "%s", line); @@ -3124,7 +3396,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3134,12 +3406,44 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, } } - d->cached = 1; - d->size = total_len; - if (total_len > size ) total_len = size; - - /* read from off 0 */ - memcpy(buf, d->buf, total_len); + if (is_s390x) { + char *origcache = d->buf; + ssize_t l; + do { + d->buf = malloc(d->buflen); + } while (!d->buf); + cache = d->buf; + cache_size = d->buflen; + total_len = 0; + l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n"); + if (l < 0 || l >= cache_size) { + free(origcache); + goto err; + } + cache_size -= l; + cache += l; + total_len += l; + l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1); + if (l < 0 || l >= cache_size) { + free(origcache); + goto err; + } + cache_size -= l; + cache += l; + total_len += l; + l = snprintf(cache, cache_size, "%s", origcache); + free(origcache); + if (l < 0 || l >= cache_size) + goto err; + total_len += l; + } + + d->cached = 1; + d->size = total_len; + if (total_len > size ) total_len = size; + + /* read from off 0 */ + memcpy(buf, d->buf, total_len); rv = total_len; err: if (f) @@ -3150,6 +3454,28 @@ err: return rv; } +static long int getreaperctime(pid_t pid) +{ + char fnam[100]; + struct stat sb; + int ret; + pid_t qpid; + + qpid = lookup_initpid_in_store(pid); + if (qpid <= 0) + return 0; + + ret = snprintf(fnam, 100, "/proc/%d", qpid); + if (ret < 0 || ret >= 100) + return 0; + + if (lstat(fnam, &sb) < 0) + return 0; + + return sb.st_ctime; +} + +#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2) static int proc_stat_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { @@ -3160,10 +3486,9 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; int curcpu = -1; /* cpu numbering starts at 0 */ - unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0; + unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0; unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0, - irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0; -#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE + irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0; char cpuall[CPUALL_MAX_SIZE]; /* reserve for cpu all */ char *cache = d->buf + CPUALL_MAX_SIZE; @@ -3199,26 +3524,31 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, //skip first line if (getline(&line, &linelen, f) < 0) { - fprintf(stderr, "proc_stat_read read first line failed\n"); + lxcfs_error("%s\n", "proc_stat_read read first line failed."); goto err; } while (getline(&line, &linelen, f) != -1) { - size_t l; + ssize_t l; int cpu; char cpu_char[10]; /* That's a lot of cores */ char *c; + if (strlen(line) == 0) + continue; if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) { /* not a ^cpuN line containing a number N, just print it */ - l = snprintf(cache, cache_size, "%s", line); + if (strncmp(line, "btime", 5) == 0) + l = snprintf(cache, cache_size, "btime %ld\n", getreaperctime(fc->pid)); + else + l = snprintf(cache, cache_size, "%s", line); if (l < 0) { perror("Error writing to cache"); rv = 0; goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3245,7 +3575,7 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3254,8 +3584,17 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, cache_size -= l; total_len += l; - if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq, - &softirq, &steal, &guest) != 9) + if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + &user, + &nice, + &system, + &idle, + &iowait, + &irq, + &softirq, + &steal, + &guest, + &guest_nice) != 10) continue; user_sum += user; nice_sum += nice; @@ -3266,18 +3605,28 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, softirq_sum += softirq; steal_sum += steal; guest_sum += guest; + guest_nice_sum += guest_nice; } cache = d->buf; - int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum); - if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){ + int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + user_sum, + nice_sum, + system_sum, + idle_sum, + iowait_sum, + irq_sum, + softirq_sum, + steal_sum, + guest_sum, + guest_nice_sum); + if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) { memcpy(cache, cpuall, cpuall_len); cache += cpuall_len; - } else{ + } else { /* shouldn't happen */ - fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len); + lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len); cpuall_len = 0; } @@ -3285,7 +3634,8 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, total_len += cpuall_len; d->cached = 1; d->size = total_len; - if (total_len > size ) total_len = size; + if (total_len > size) + total_len = size; memcpy(buf, d->buf, total_len); rv = total_len; @@ -3301,23 +3651,12 @@ err: static long int getreaperage(pid_t pid) { - char fnam[100]; - struct stat sb; - int ret; - pid_t qpid; + long int ctime; - qpid = lookup_initpid_in_store(pid); - if (qpid <= 0) - return 0; - - ret = snprintf(fnam, 100, "/proc/%d", qpid); - if (ret < 0 || ret >= 100) - return 0; - - if (lstat(fnam, &sb) < 0) - return 0; - - return time(NULL) - sb.st_ctime; + ctime = getreaperctime(pid); + if (ctime) + return time(NULL) - ctime; + return ctime; } static unsigned long get_reaper_busy(pid_t task) @@ -3347,17 +3686,9 @@ out: #if RELOADTEST void iwashere(void) { - char *name, *cwd = get_current_dir_name(); - size_t len; int fd; - if (!cwd) - exit(1); - len = strlen(cwd) + strlen("/iwashere") + 1; - name = alloca(len); - snprintf(name, len, "%s/iwashere", cwd); - free(cwd); - fd = creat(name, 0755); + fd = creat("/tmp/lxcfs-iwashere", 0644); if (fd >= 0) close(fd); } @@ -3376,7 +3707,7 @@ static int proc_uptime_read(char *buf, size_t size, off_t offset, long int reaperage = getreaperage(fc->pid); unsigned long int busytime = get_reaper_busy(fc->pid), idletime; char *cache = d->buf; - size_t total_len = 0; + ssize_t total_len = 0; #if RELOADTEST iwashere(); @@ -3454,15 +3785,15 @@ static int proc_diskstats_read(char *buf, size_t size, off_t offset, return read_file("/proc/diskstats", buf, size, d); prune_init_slice(cg); - if (!cgfs_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str)) + if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str)) goto err; - if (!cgfs_get_value("blkio", cg, "blkio.io_merged", &io_merged_str)) + if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str)) goto err; - if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str)) + if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str)) goto err; - if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str)) + if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str)) goto err; - if (!cgfs_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str)) + if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str)) goto err; @@ -3471,55 +3802,53 @@ static int proc_diskstats_read(char *buf, size_t size, off_t offset, goto err; while (getline(&line, &linelen, f) != -1) { - size_t l; - char *printme, lbuf[256]; + ssize_t l; + char lbuf[256]; i = sscanf(line, "%u %u %71s", &major, &minor, dev_name); - if(i == 3){ - get_blkio_io_value(io_serviced_str, major, minor, "Read", &read); - get_blkio_io_value(io_serviced_str, major, minor, "Write", &write); - get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged); - get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged); - get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors); - read_sectors = read_sectors/512; - get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors); - write_sectors = write_sectors/512; - - get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm); - rd_svctm = rd_svctm/1000000; - get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait); - rd_wait = rd_wait/1000000; - read_ticks = rd_svctm + rd_wait; - - get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm); - wr_svctm = wr_svctm/1000000; - get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait); - wr_wait = wr_wait/1000000; - write_ticks = wr_svctm + wr_wait; - - get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks); - tot_ticks = tot_ticks/1000000; - }else{ + if (i != 3) continue; - } + + get_blkio_io_value(io_serviced_str, major, minor, "Read", &read); + get_blkio_io_value(io_serviced_str, major, minor, "Write", &write); + get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged); + get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged); + get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors); + read_sectors = read_sectors/512; + get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors); + write_sectors = write_sectors/512; + + get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm); + rd_svctm = rd_svctm/1000000; + get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait); + rd_wait = rd_wait/1000000; + read_ticks = rd_svctm + rd_wait; + + get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm); + wr_svctm = wr_svctm/1000000; + get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait); + wr_wait = wr_wait/1000000; + write_ticks = wr_svctm + wr_wait; + + get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks); + tot_ticks = tot_ticks/1000000; memset(lbuf, 0, 256); - if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) { + if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", major, minor, dev_name, read, read_merged, read_sectors, read_ticks, write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks); - printme = lbuf; - } else + else continue; - l = snprintf(cache, cache_size, "%s", printme); + l = snprintf(cache, cache_size, "%s", lbuf); if (l < 0) { perror("Error writing to fuse buf"); rv = 0; goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3553,10 +3882,10 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, struct fuse_context *fc = fuse_get_context(); struct file_info *d = (struct file_info *)fi->fh; char *cg = NULL; - char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL, - *memswlimit_default_str = NULL, *memswusage_default_str = NULL; + char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL; unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0; - size_t total_len = 0, rv = 0; + ssize_t total_len = 0, rv = 0; + ssize_t l = 0; char *cache = d->buf; if (offset) { @@ -3578,32 +3907,19 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, return read_file("/proc/swaps", buf, size, d); prune_init_slice(cg); - if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str)) - goto err; + memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str)) goto err; - memlimit = strtoul(memlimit_str, NULL, 10); memusage = strtoul(memusage_str, NULL, 10); if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) && cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) { - /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */ - if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str)) - goto err; - if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str)) - goto err; - - memswlimit = strtoul(memswlimit_str, NULL, 10); + memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes"); memswusage = strtoul(memswusage_str, NULL, 10); - if (!strcmp(memswlimit_str, memswlimit_default_str)) - memswlimit = 0; - if (!strcmp(memswusage_str, memswusage_default_str)) - memswusage = 0; - swap_total = (memswlimit - memlimit) / 1024; swap_free = (memswusage - memusage) / 1024; } @@ -3632,12 +3948,13 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, } if (swap_total > 0) { - total_len += snprintf(d->buf + total_len, d->size - total_len, - "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ", - swap_total, swap_free); + l = snprintf(d->buf + total_len, d->size - total_len, + "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ", + swap_total, swap_free); + total_len += l; } - if (total_len < 0) { + if (total_len < 0 || l < 0) { perror("Error writing to cache"); rv = 0; goto err; @@ -3656,8 +3973,6 @@ err: free(memlimit_str); free(memusage_str); free(memswusage_str); - free(memswusage_default_str); - free(memswlimit_default_str); return rv; } @@ -3710,12 +4025,14 @@ int proc_getattr(const char *path, struct stat *sb) int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi) { - if (filler(buf, "cpuinfo", NULL, 0) != 0 || - filler(buf, "meminfo", NULL, 0) != 0 || - filler(buf, "stat", NULL, 0) != 0 || - filler(buf, "uptime", NULL, 0) != 0 || - filler(buf, "diskstats", NULL, 0) != 0 || - filler(buf, "swaps", NULL, 0) != 0) + if (filler(buf, ".", NULL, 0) != 0 || + filler(buf, "..", NULL, 0) != 0 || + filler(buf, "cpuinfo", NULL, 0) != 0 || + filler(buf, "meminfo", NULL, 0) != 0 || + filler(buf, "stat", NULL, 0) != 0 || + filler(buf, "uptime", NULL, 0) != 0 || + filler(buf, "diskstats", NULL, 0) != 0 || + filler(buf, "swaps", NULL, 0) != 0) return -EINVAL; return 0; } @@ -3759,11 +4076,20 @@ int proc_open(const char *path, struct fuse_file_info *fi) return 0; } -int proc_release(const char *path, struct fuse_file_info *fi) +int proc_access(const char *path, int mask) { - struct file_info *f = (struct file_info *)fi->fh; + if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0) + return 0; - do_release_file_info(f); + /* these are all read-only */ + if ((mask & ~R_OK) != 0) + return -EACCES; + return 0; +} + +int proc_release(const char *path, struct fuse_file_info *fi) +{ + do_release_file_info(fi); return 0; } @@ -3790,22 +4116,359 @@ int proc_read(const char *path, char *buf, size_t size, off_t offset, } } -static void __attribute__((constructor)) collect_subsystems(void) +/* + * Functions needed to setup cgroups in the __constructor__. + */ + +static bool mkdir_p(const char *dir, mode_t mode) +{ + const char *tmp = dir; + const char *orig = dir; + char *makeme; + + do { + dir = tmp + strspn(tmp, "/"); + tmp = dir + strcspn(dir, "/"); + makeme = strndup(orig, dir - orig); + if (!makeme) + return false; + if (mkdir(makeme, mode) && errno != EEXIST) { + lxcfs_error("Failed to create directory '%s': %s.\n", + makeme, strerror(errno)); + free(makeme); + return false; + } + free(makeme); + } while(tmp != dir); + + return true; +} + +static bool umount_if_mounted(void) +{ + if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { + lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); + return false; + } + return true; +} + +/* __typeof__ should be safe to use with all compilers. */ +typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic; +static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val) +{ + return (fs->f_type == (fs_type_magic)magic_val); +} + +/* + * looking at fs/proc_namespace.c, it appears we can + * actually expect the rootfs entry to very specifically contain + * " - rootfs rootfs " + * IIUC, so long as we've chrooted so that rootfs is not our root, + * the rootfs entry should always be skipped in mountinfo contents. + */ +static bool is_on_ramfs(void) { FILE *f; + char *p, *p2; char *line = NULL; size_t len = 0; + int i; + + f = fopen("/proc/self/mountinfo", "r"); + if (!f) + return false; + + while (getline(&line, &len, f) != -1) { + for (p = line, i = 0; p && i < 4; i++) + p = strchr(p + 1, ' '); + if (!p) + continue; + p2 = strchr(p + 1, ' '); + if (!p2) + continue; + *p2 = '\0'; + if (strcmp(p + 1, "/") == 0) { + // this is '/'. is it the ramfs? + p = strchr(p2 + 1, '-'); + if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) { + free(line); + fclose(f); + return true; + } + } + } + free(line); + fclose(f); + return false; +} + +static int pivot_enter() +{ + int ret = -1, oldroot = -1, newroot = -1; + + oldroot = open("/", O_DIRECTORY | O_RDONLY); + if (oldroot < 0) { + lxcfs_error("%s\n", "Failed to open old root for fchdir."); + return ret; + } + + newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY); + if (newroot < 0) { + lxcfs_error("%s\n", "Failed to open new root for fchdir."); + goto err; + } + + /* change into new root fs */ + if (fchdir(newroot) < 0) { + lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR); + goto err; + } + + /* pivot_root into our new root fs */ + if (pivot_root(".", ".") < 0) { + lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno)); + goto err; + } + + /* + * At this point the old-root is mounted on top of our new-root. + * To unmounted it we must not be chdir'd into it, so escape back + * to the old-root. + */ + if (fchdir(oldroot) < 0) { + lxcfs_error("%s\n", "Failed to enter old root."); + goto err; + } + + if (umount2(".", MNT_DETACH) < 0) { + lxcfs_error("%s\n", "Failed to detach old root."); + goto err; + } + + if (fchdir(newroot) < 0) { + lxcfs_error("%s\n", "Failed to re-enter new root."); + goto err; + } + + ret = 0; + +err: + if (oldroot > 0) + close(oldroot); + if (newroot > 0) + close(newroot); + + return ret; +} + +static int chroot_enter() +{ + if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { + lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); + return -1; + } + + if (chroot(".") < 0) { + lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); + return -1; + } + + if (chdir("/") < 0) { + lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); + return -1; + } + + return 0; +} + +static int permute_and_enter(void) +{ + struct statfs sb; + + if (statfs("/", &sb) < 0) { + lxcfs_error("%s\n", "Could not stat / mountpoint."); + return -1; + } + + /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will + * likely report TMPFS_MAGIC. Hence, when it reports no we still check + * /proc/1/mountinfo. */ + if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs()) + return chroot_enter(); + + if (pivot_enter() < 0) { + lxcfs_error("%s\n", "Could not perform pivot root."); + return -1; + } + + return 0; +} + +/* Prepare our new clean root. */ +static int permute_prepare(void) +{ + if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { + lxcfs_error("%s\n", "Failed to create directory for new root."); + return -1; + } + + if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { + lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); + return -1; + } + + if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { + lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); + return -1; + } + + if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { + printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); + return -1; + } + + return 0; +} + +/* Calls chroot() on ramfs, pivot_root() in all other cases. */ +static bool permute_root(void) +{ + /* Prepare new root. */ + if (permute_prepare() < 0) + return false; + + /* Pivot into new root. */ + if (permute_and_enter() < 0) + return false; + + return true; +} + +static bool cgfs_prepare_mounts(void) +{ + if (!mkdir_p(BASEDIR, 0700)) { + lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); + return false; + } + + if (!umount_if_mounted()) { + lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); + return false; + } + + if (unshare(CLONE_NEWNS) < 0) { + lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); + return false; + } + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { + lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); + return false; + } + + if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { + lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); + return false; + } + + return true; +} + +static bool cgfs_mount_hierarchies(void) +{ + char *target; + size_t clen, len; + int i, ret; + + for (i = 0; i < num_hierarchies; i++) { + char *controller = hierarchies[i]; + + clen = strlen(controller); + len = strlen(BASEDIR) + clen + 2; + target = malloc(len); + if (!target) + return false; + + ret = snprintf(target, len, "%s/%s", BASEDIR, controller); + if (ret < 0 || ret >= len) { + free(target); + return false; + } + if (mkdir(target, 0755) < 0 && errno != EEXIST) { + free(target); + return false; + } + if (!strcmp(controller, "unified")) + ret = mount("none", target, "cgroup2", 0, NULL); + else + ret = mount(controller, target, "cgroup", 0, controller); + if (ret < 0) { + lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno)); + free(target); + return false; + } + + fd_hierarchies[i] = open(target, O_DIRECTORY); + if (fd_hierarchies[i] < 0) { + free(target); + return false; + } + free(target); + } + return true; +} + +static bool cgfs_setup_controllers(void) +{ + if (!cgfs_prepare_mounts()) + return false; + + if (!cgfs_mount_hierarchies()) { + lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts."); + return false; + } + + if (!permute_root()) + return false; + + return true; +} + +static int preserve_ns(int pid) +{ + int ret; + size_t len = 5 /* /proc */ + 21 /* /int_as_str */ + 7 /* /ns/mnt */ + 1 /* \0 */; + char path[len]; + + ret = snprintf(path, len, "/proc/%d/ns/mnt", pid); + if (ret < 0 || (size_t)ret >= len) + return -1; + + return open(path, O_RDONLY | O_CLOEXEC); +} + +static void __attribute__((constructor)) collect_and_mount_subsystems(void) +{ + FILE *f; + char *cret, *line = NULL; + char cwd[MAXPATHLEN]; + size_t len = 0; + int i, init_ns = -1; + bool found_unified = false; if ((f = fopen("/proc/self/cgroup", "r")) == NULL) { - fprintf(stderr, "Error opening /proc/self/cgroup: %s\n", strerror(errno)); + lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno)); return; } + while (getline(&line, &len, f) != -1) { - char *p, *p2; + char *idx, *p, *p2; p = strchr(line, ':'); if (!p) goto out; + idx = line; *(p++) = '\0'; p2 = strrchr(p, ':'); @@ -3813,23 +4476,76 @@ static void __attribute__((constructor)) collect_subsystems(void) goto out; *p2 = '\0'; + /* With cgroupv2 /proc/self/cgroup can contain entries of the + * form: 0::/ This will cause lxcfs to fail the cgroup mounts + * because it parses out the empty string "" and later on passes + * it to mount(). Let's skip such entries. + */ + if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) { + found_unified = true; + p = "unified"; + } + if (!store_hierarchy(line, p)) goto out; } + /* Preserve initial namespace. */ + init_ns = preserve_ns(getpid()); + if (init_ns < 0) { + lxcfs_error("%s\n", "Failed to preserve initial mount namespace."); + goto out; + } + + fd_hierarchies = malloc(sizeof(int) * num_hierarchies); + if (!fd_hierarchies) { + lxcfs_error("%s\n", strerror(errno)); + goto out; + } + + for (i = 0; i < num_hierarchies; i++) + fd_hierarchies[i] = -1; + + cret = getcwd(cwd, MAXPATHLEN); + if (!cret) + lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno)); + + /* This function calls unshare(CLONE_NEWNS) our initial mount namespace + * to privately mount lxcfs cgroups. */ + if (!cgfs_setup_controllers()) { + lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs."); + goto out; + } + + if (setns(init_ns, 0) < 0) { + lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno)); + goto out; + } + + if (!cret || chdir(cwd) < 0) + lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno)); + print_subsystems(); out: free(line); fclose(f); + if (init_ns >= 0) + close(init_ns); } static void __attribute__((destructor)) free_subsystems(void) { int i; - for (i = 0; i < num_hierarchies; i++) + lxcfs_debug("%s\n", "Running destructor for liblxcfs."); + + for (i = 0; i < num_hierarchies; i++) { if (hierarchies[i]) free(hierarchies[i]); + if (fd_hierarchies && fd_hierarchies[i] >= 0) + close(fd_hierarchies[i]); + } free(hierarchies); + free(fd_hierarchies); }