X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=bindings.c;h=885eb64936c9921653f5a66d0d44204698bfedbf;hb=a257a8eedcdd48034880fbe5def1d5b809d8a16e;hp=aaba840a1862725b6e471968e25fb8ac88ea79b4;hpb=bc70ba9b492f1af79ce692471f3e300eaf4afe29;p=mirror_lxcfs.git diff --git a/bindings.c b/bindings.c index aaba840..885eb64 100644 --- a/bindings.c +++ b/bindings.c @@ -8,20 +8,24 @@ #define FUSE_USE_VERSION 26 +#define __STDC_FORMAT_MACROS #include #include #include #include +#include #include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -29,10 +33,15 @@ #include #include #include +#include +#include #include "bindings.h" #include "config.h" // for VERSION +/* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */ +#define LXCFS_NUMSTRLEN64 21 + /* Define pivot_root() if missing from the C library */ #ifndef HAVE_PIVOT_ROOT static int pivot_root(const char * new_root, const char * put_old) @@ -48,16 +57,6 @@ return -1; extern int pivot_root(const char * new_root, const char * put_old); #endif -#ifdef DEBUG -#define lxcfs_debug(format, ...) \ - do { \ - fprintf(stderr, "%s: %d: %s: " format, __FILE__, __LINE__, \ - __func__, __VA_ARGS__); \ - } while (false) -#else -#define lxcfs_debug(format, ...) -#endif /* DEBUG */ - enum { LXC_TYPE_CGDIR, LXC_TYPE_CGFILE, @@ -80,8 +79,8 @@ struct file_info { int cached; }; -/* reserve buffer size, for cpuall in /proc/stat */ -#define BUF_RESERVE_SIZE 256 +/* Reserve buffer size to account for file size changes. */ +#define BUF_RESERVE_SIZE 512 /* * A table caching which pid is init for a pid namespace. @@ -116,7 +115,7 @@ static void lock_mutex(pthread_mutex_t *l) int ret; if ((ret = pthread_mutex_lock(l)) != 0) { - fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", ret, strerror(ret)); + lxcfs_error("returned:%d %s\n", ret, strerror(ret)); exit(1); } } @@ -140,13 +139,14 @@ static char **hierarchies; * another namespace using the *at() family of functions * {openat(), fchownat(), ...}. */ static int *fd_hierarchies; +static int cgroup_mount_ns_fd = -1; static void unlock_mutex(pthread_mutex_t *l) { int ret; if ((ret = pthread_mutex_unlock(l)) != 0) { - fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", ret, strerror(ret)); + lxcfs_error("returned:%d %s\n", ret, strerror(ret)); exit(1); } } @@ -383,12 +383,12 @@ static bool write_string(const char *fnam, const char *string, int fd) len = strlen(string); ret = fwrite(string, 1, len, f); if (ret != len) { - fprintf(stderr, "Error writing to file: %s\n", strerror(errno)); + lxcfs_error("Error writing to file: %s\n", strerror(errno)); fclose(f); return false; } if (fclose(f) < 0) { - fprintf(stderr, "Error writing to file: %s\n", strerror(errno)); + lxcfs_error("Error writing to file: %s\n", strerror(errno)); return false; } return true; @@ -408,7 +408,7 @@ static bool store_hierarchy(char *stridx, char *h) n *= ALLOC_NUM; char **tmp = realloc(hierarchies, n * sizeof(char *)); if (!tmp) { - fprintf(stderr, "Out of memory\n"); + lxcfs_error("%s\n", strerror(errno)); exit(1); } hierarchies = tmp; @@ -422,10 +422,12 @@ static void print_subsystems(void) { int i; + fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd); fprintf(stderr, "hierarchies:\n"); for (i = 0; i < num_hierarchies; i++) { if (hierarchies[i]) - fprintf(stderr, " %d: %s\n", i, hierarchies[i]); + fprintf(stderr, " %2d: fd: %3d: %s\n", i, + fd_hierarchies[i], hierarchies[i]); } } @@ -512,7 +514,7 @@ static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, in len = strlen(dirname); if (len >= MAXPATHLEN) { - fprintf(stderr, "chown_all_cgroup_files: pathname too long: %s\n", dirname); + lxcfs_error("Pathname too long: %s\n", dirname); return; } @@ -522,7 +524,7 @@ static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, in d = fdopendir(fd1); if (!d) { - fprintf(stderr, "chown_all_cgroup_files: failed to open %s\n", dirname); + lxcfs_error("Failed to open %s\n", dirname); return; } @@ -531,11 +533,11 @@ static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, in continue; ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); if (ret < 0 || ret >= MAXPATHLEN) { - fprintf(stderr, "chown_all_cgroup_files: pathname too long under %s\n", dirname); + lxcfs_error("Pathname too long under %s\n", dirname); continue; } if (fchownat(fd, path, uid, gid, 0) < 0) - fprintf(stderr, "Failed to chown file %s to %u:%u", path, uid, gid); + lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid); } closedir(d); } @@ -600,7 +602,7 @@ static bool recursive_rmdir(const char *dirname, int fd, const int cfd) rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); if (rc < 0 || rc >= MAXPATHLEN) { - fprintf(stderr, "pathname too long\n"); + lxcfs_error("%s\n", "Pathname too long."); continue; } @@ -616,7 +618,7 @@ static bool recursive_rmdir(const char *dirname, int fd, const int cfd) ret = true; if (closedir(dir) < 0) { - fprintf(stderr, "%s: failed to close directory %s: %s\n", __func__, dirname, strerror(errno)); + lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno)); ret = false; } @@ -766,7 +768,7 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool cg = alloca(len); ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup); if (ret < 0 || (size_t)ret >= len) { - fprintf(stderr, "%s: pathname too long under %s\n", __func__, cgroup); + lxcfs_error("Pathname too long under %s\n", cgroup); return false; } @@ -787,13 +789,13 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name); if (ret < 0 || ret >= MAXPATHLEN) { - fprintf(stderr, "%s: pathname too long under %s\n", __func__, cg); + lxcfs_error("Pathname too long under %s\n", cg); continue; } ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW); if (ret) { - fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno)); + lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno)); continue; } if ((!directories && !S_ISREG(mystat.st_mode)) || @@ -813,7 +815,7 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool sz++; } if (closedir(dir) < 0) { - fprintf(stderr, "%s: failed closedir for %s: %s\n", __func__, cgroup, strerror(errno)); + lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno)); return false; } return true; @@ -870,11 +872,11 @@ bool cgfs_get_value(const char *controller, const char *cgroup, const char *file fnam = alloca(len); ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file); if (ret < 0 || (size_t)ret >= len) - return NULL; + return false; fd = openat(cfd, fnam, O_RDONLY); if (fd < 0) - return NULL; + return false; *value = slurp_file(fnam, fd); return *value != NULL; @@ -932,8 +934,8 @@ static void *make_key_list_entry(const char *controller, const char *cgroup, con { struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry); if (!entry) { - fprintf(stderr, "%s: Error getting files under %s:%s\n", - __func__, controller, cgroup); + lxcfs_error("Error getting files under %s:%s\n", controller, + cgroup); } return entry; } @@ -1170,7 +1172,7 @@ convert_id_to_ns(FILE *idfile, unsigned int in_id) * uids wrapped around - unexpected as this is a procfile, * so just bail. */ - fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n", + lxcfs_error("pid wrapparound at entry %u %u %u in %s\n", nsuid, hostuid, count, line); return -1; } @@ -1276,7 +1278,7 @@ static char *get_next_cgroup_dir(const char *taskcg, const char *querycg) char *start, *end; if (strlen(taskcg) <= strlen(querycg)) { - fprintf(stderr, "%s: I was fed bad input\n", __func__); + lxcfs_error("%s\n", "I was fed bad input."); return NULL; } @@ -1507,7 +1509,7 @@ static char *pick_controller_from_path(struct fuse_context *fc, const char *path char *contr, *slash; if (strlen(path) < 9) { - errno = EINVAL; + errno = EACCES; return NULL; } if (*(path + 7) != '/') { @@ -1542,7 +1544,7 @@ static const char *find_cgroup_in_path(const char *path) const char *p1; if (strlen(path) < 9) { - errno = EINVAL; + errno = EACCES; return NULL; } p1 = strstr(path + 8, "/"); @@ -1680,11 +1682,6 @@ int cg_getattr(const char *path, struct stat *sb) ret = -ENOENT; goto out; } - if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) { - ret = -EACCES; - goto out; - } - ret = 0; } @@ -1758,7 +1755,7 @@ int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset return -EIO; if (d->type != LXC_TYPE_CGDIR) { - fprintf(stderr, "Internal error: file cache info used in readdir\n"); + lxcfs_error("%s\n", "Internal error: file cache info used in readdir."); return -EIO; } if (!d->cgroup && !d->controller) { @@ -2003,14 +2000,14 @@ static bool wait_for_sock(int sock, int timeout) return false; if ((epfd = epoll_create(1)) < 0) { - fprintf(stderr, "Failed to create epoll socket: %m\n"); + lxcfs_error("%s\n", "Failed to create epoll socket: %m."); return false; } ev.events = POLLIN_SET; ev.data.fd = sock; if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) { - fprintf(stderr, "Failed adding socket to epoll: %m\n"); + lxcfs_error("%s\n", "Failed adding socket to epoll: %m."); close(epfd); return false; } @@ -2058,8 +2055,7 @@ static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst) if (pingfirst) { if (msgrecv(sock, buf, 1) != 1) { - fprintf(stderr, "%s: Error getting reply from server over socketpair\n", - __func__); + lxcfs_error("%s\n", "Error getting reply from server over socketpair."); return SEND_CREDS_FAIL; } } @@ -2083,8 +2079,7 @@ static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst) msg.msg_iovlen = 1; if (sendmsg(sock, &msg, 0) < 0) { - fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__, - strerror(errno)); + lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno)); if (errno == 3) return SEND_CREDS_NOTSK; return SEND_CREDS_FAIL; @@ -2110,12 +2105,12 @@ static bool recv_creds(int sock, struct ucred *cred, char *v) cred->gid = -1; if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) { - fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno)); + lxcfs_error("Failed to set passcred: %s\n", strerror(errno)); return false; } buf[0] = '1'; if (write(sock, buf, 1) != 1) { - fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno)); + lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno)); return false; } @@ -2130,14 +2125,12 @@ static bool recv_creds(int sock, struct ucred *cred, char *v) msg.msg_iovlen = 1; if (!wait_for_sock(sock, 2)) { - fprintf(stderr, "Timed out waiting for scm_cred: %s\n", - strerror(errno)); + lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno)); return false; } ret = recvmsg(sock, &msg, MSG_DONTWAIT); if (ret < 0) { - fprintf(stderr, "Failed to receive scm_cred: %s\n", - strerror(errno)); + lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno)); return false; } @@ -2170,10 +2163,8 @@ static int pid_ns_clone_wrapper(void *arg) { char b = '1'; close(args->cpipe[0]); - if (write(args->cpipe[1], &b, sizeof(char)) < 0) { - fprintf(stderr, "%s (child): error on write: %s\n", - __func__, strerror(errno)); - } + if (write(args->cpipe[1], &b, sizeof(char)) < 0) + lxcfs_error("(child): error on write: %s.\n", strerror(errno)); close(args->cpipe[1]); return args->wrapped(args->sock, args->tpid); } @@ -2307,13 +2298,11 @@ bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *fi // read converted results if (!wait_for_sock(sock[0], 2)) { - fprintf(stderr, "%s: timed out waiting for pid from child: %s\n", - __func__, strerror(errno)); + lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno)); goto out; } if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { - fprintf(stderr, "%s: error reading pid from child: %s\n", - __func__, strerror(errno)); + lxcfs_error("Error reading pid from child: %s.\n", strerror(errno)); goto out; } must_strcat_pid(d, &sz, &asz, qpid); @@ -2328,8 +2317,7 @@ next: v = '1'; if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) { // failed to ask child to exit - fprintf(stderr, "%s: failed to ask child to exit: %s\n", - __func__, strerror(errno)); + lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno)); goto out; } @@ -2357,7 +2345,7 @@ int cg_read(const char *path, char *buf, size_t size, off_t offset, bool r; if (f->type != LXC_TYPE_CGFILE) { - fprintf(stderr, "Internal error: directory cache info used in cg_read\n"); + lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read."); return -EIO; } @@ -2424,12 +2412,11 @@ static int pid_from_ns(int sock, pid_t tpid) cred.gid = 0; while (1) { if (!wait_for_sock(sock, 2)) { - fprintf(stderr, "%s: timeout reading from parent\n", __func__); + lxcfs_error("%s\n", "Timeout reading from parent."); return 1; } if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) { - fprintf(stderr, "%s: bad read from parent: %s\n", - __func__, strerror(errno)); + lxcfs_error("Bad read from parent: %s.\n", strerror(errno)); return 1; } if (vpid == -1) // done @@ -2530,20 +2517,20 @@ void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid) *gid = -1; sprintf(line, "/proc/%d/status", pid); if ((f = fopen(line, "r")) == NULL) { - fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno)); + lxcfs_error("Error opening %s: %s\n", line, strerror(errno)); return; } while (fgets(line, 400, f)) { if (strncmp(line, "Uid:", 4) == 0) { if (sscanf(line+4, "%u", &u) != 1) { - fprintf(stderr, "bad uid line for pid %u\n", pid); + lxcfs_error("bad uid line for pid %u\n", pid); fclose(f); return; } *uid = u; } else if (strncmp(line, "Gid:", 4) == 0) { if (sscanf(line+4, "%u", &g) != 1) { - fprintf(stderr, "bad gid line for pid %u\n", pid); + lxcfs_error("bad gid line for pid %u\n", pid); fclose(f); return; } @@ -2615,8 +2602,7 @@ static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char char v; if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { - fprintf(stderr, "%s: error writing pid to child: %s\n", - __func__, strerror(errno)); + lxcfs_error("Error writing pid to child: %s.\n", strerror(errno)); goto out; } @@ -2640,7 +2626,7 @@ static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char /* All good, write the value */ qpid = -1; if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid)) - fprintf(stderr, "Warning: failed to ask child to exit\n"); + lxcfs_error("%s\n", "Warning: failed to ask child to exit."); if (!fail) answer = true; @@ -2669,7 +2655,7 @@ int cg_write(const char *path, const char *buf, size_t size, off_t offset, bool r; if (f->type != LXC_TYPE_CGFILE) { - fprintf(stderr, "Internal error: directory cache info used in cg_write\n"); + lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write."); return -EIO; } @@ -2910,16 +2896,20 @@ int cg_rmdir(const char *path) return -EIO; controller = pick_controller_from_path(fc, path); - if (!controller) - return -errno; + if (!controller) /* Someone's trying to delete "/cgroup". */ + return -EPERM; cgroup = find_cgroup_in_path(path); - if (!cgroup) - return -errno; + if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */ + return -EPERM; get_cgdir_and_path(cgroup, &cgdir, &last); if (!last) { - ret = -EINVAL; + /* Someone's trying to delete a cgroup on the same level as the + * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or + * rmdir "/cgroup/blkio/init.slice". + */ + ret = -EPERM; goto out; } @@ -2927,7 +2917,7 @@ int cg_rmdir(const char *path) if (initpid <= 0) initpid = fc->pid; if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) { - if (!last || strcmp(next, last) == 0) + if (!last || (next && (strcmp(next, last) == 0))) ret = -EBUSY; else ret = -ENOENT; @@ -2972,19 +2962,19 @@ static void parse_memstat(char *memstat, unsigned long *cached, while (*memstat) { if (startswith(memstat, "cache")) { - sscanf(memstat + 11, "%lu", cached); + sscanf(memstat + 5, "%lu", cached); *cached /= 1024; } else if (startswith(memstat, "active_anon")) { sscanf(memstat + 11, "%lu", active_anon); *active_anon /= 1024; } else if (startswith(memstat, "inactive_anon")) { - sscanf(memstat + 11, "%lu", inactive_anon); + sscanf(memstat + 13, "%lu", inactive_anon); *inactive_anon /= 1024; } else if (startswith(memstat, "active_file")) { sscanf(memstat + 11, "%lu", active_file); *active_file /= 1024; } else if (startswith(memstat, "inactive_file")) { - sscanf(memstat + 11, "%lu", inactive_file); + sscanf(memstat + 13, "%lu", inactive_file); *inactive_file /= 1024; } else if (startswith(memstat, "unevictable")) { sscanf(memstat + 11, "%lu", unevictable); @@ -3039,7 +3029,7 @@ static int read_file(const char *path, char *buf, size_t size, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3065,12 +3055,12 @@ static int read_file(const char *path, char *buf, size_t size, * FUSE ops for /proc */ -static unsigned long get_memlimit(const char *cgroup) +static unsigned long get_memlimit(const char *cgroup, const char *file) { char *memlimit_str = NULL; unsigned long memlimit = -1; - if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &memlimit_str)) + if (cgfs_get_value("memory", cgroup, file, &memlimit_str)) memlimit = strtoul(memlimit_str, NULL, 10); free(memlimit_str); @@ -3078,16 +3068,16 @@ static unsigned long get_memlimit(const char *cgroup) return memlimit; } -static unsigned long get_min_memlimit(const char *cgroup) +static unsigned long get_min_memlimit(const char *cgroup, const char *file) { char *copy = strdupa(cgroup); unsigned long memlimit = 0, retlimit; - retlimit = get_memlimit(copy); + retlimit = get_memlimit(copy, file); while (strcmp(copy, "/") != 0) { copy = dirname(copy); - memlimit = get_memlimit(copy); + memlimit = get_memlimit(copy, file); if (memlimit != -1 && memlimit < retlimit) retlimit = memlimit; }; @@ -3102,11 +3092,11 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, struct file_info *d = (struct file_info *)fi->fh; char *cg; char *memusage_str = NULL, *memstat_str = NULL, - *memswlimit_str = NULL, *memswusage_str = NULL, - *memswlimit_default_str = NULL, *memswusage_default_str = NULL; + *memswlimit_str = NULL, *memswusage_str = NULL; unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0, cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0, - active_file = 0, inactive_file = 0, unevictable = 0; + active_file = 0, inactive_file = 0, unevictable = 0, + hostswtotal = 0; char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; char *cache = d->buf; @@ -3132,7 +3122,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, return read_file("/proc/meminfo", buf, size, d); prune_init_slice(cg); - memlimit = get_min_memlimit(cg); + memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str)) goto err; if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str)) @@ -3143,20 +3133,9 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) && cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str)) { - /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */ - if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str)) - goto err; - if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str)) - goto err; - - memswlimit = strtoul(memswlimit_str, NULL, 10); + memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes"); memswusage = strtoul(memswusage_str, NULL, 10); - if (!strcmp(memswlimit_str, memswlimit_default_str)) - memswlimit = 0; - if (!strcmp(memswusage_str, memswusage_default_str)) - memswusage = 0; - memswlimit = memswlimit / 1024; memswusage = memswusage / 1024; } @@ -3179,7 +3158,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, memset(lbuf, 0, 100); if (startswith(line, "MemTotal:")) { - sscanf(line+14, "%lu", &hosttotal); + sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal); if (hosttotal < memlimit) memlimit = hosttotal; snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit); @@ -3191,10 +3170,13 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage); printme = lbuf; } else if (startswith(line, "SwapTotal:") && memswlimit > 0) { - snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit); + sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal); + if (hostswtotal < memswlimit) + memswlimit = hostswtotal; + snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit); printme = lbuf; } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) { - unsigned long swaptotal = memswlimit - memlimit, + unsigned long swaptotal = memswlimit, swapusage = memswusage - memusage, swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0; snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree); @@ -3211,11 +3193,11 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, } else if (startswith(line, "SwapCached:")) { snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL); printme = lbuf; - } else if (startswith(line, "Active")) { + } else if (startswith(line, "Active:")) { snprintf(lbuf, 100, "Active: %8lu kB\n", active_anon + active_file); printme = lbuf; - } else if (startswith(line, "Inactive")) { + } else if (startswith(line, "Inactive:")) { snprintf(lbuf, 100, "Inactive: %8lu kB\n", inactive_anon + inactive_file); printme = lbuf; @@ -3251,7 +3233,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3276,8 +3258,6 @@ err: free(memswlimit_str); free(memswusage_str); free(memstat_str); - free(memswlimit_default_str); - free(memswusage_default_str); return rv; } @@ -3382,7 +3362,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3407,7 +3387,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3425,7 +3405,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3483,6 +3463,155 @@ err: return rv; } +static uint64_t get_reaper_start_time(pid_t pid) +{ + int ret; + FILE *f; + uint64_t starttime; + /* strlen("/proc/") = 6 + * + + * LXCFS_NUMSTRLEN64 + * + + * strlen("/stat") = 5 + * + + * \0 = 1 + * */ +#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1) + char path[__PROC_PID_STAT_LEN]; + pid_t qpid; + + qpid = lookup_initpid_in_store(pid); + if (qpid <= 0) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid); + if (ret < 0 || ret >= __PROC_PID_STAT_LEN) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + f = fopen(path, "r"); + if (!f) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + /* Note that the *scanf() argument supression requires that length + * modifiers such as "l" are omitted. Otherwise some compilers will yell + * at us. It's like telling someone you're not married and then asking + * if you can bring your wife to the party. + */ + ret = fscanf(f, "%*d " /* (1) pid %d */ + "%*s " /* (2) comm %s */ + "%*c " /* (3) state %c */ + "%*d " /* (4) ppid %d */ + "%*d " /* (5) pgrp %d */ + "%*d " /* (6) session %d */ + "%*d " /* (7) tty_nr %d */ + "%*d " /* (8) tpgid %d */ + "%*u " /* (9) flags %u */ + "%*u " /* (10) minflt %lu */ + "%*u " /* (11) cminflt %lu */ + "%*u " /* (12) majflt %lu */ + "%*u " /* (13) cmajflt %lu */ + "%*u " /* (14) utime %lu */ + "%*u " /* (15) stime %lu */ + "%*d " /* (16) cutime %ld */ + "%*d " /* (17) cstime %ld */ + "%*d " /* (18) priority %ld */ + "%*d " /* (19) nice %ld */ + "%*d " /* (20) num_threads %ld */ + "%*d " /* (21) itrealvalue %ld */ + "%" PRIu64, /* (22) starttime %llu */ + &starttime); + if (ret != 1) { + fclose(f); + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + fclose(f); + + errno = 0; + return starttime; +} + +static uint64_t get_reaper_start_time_in_sec(pid_t pid) +{ + uint64_t clockticks; + int64_t ticks_per_sec; + + clockticks = get_reaper_start_time(pid); + if (clockticks == 0 && errno == EINVAL) { + lxcfs_debug("failed to retrieve start time of pid %d\n", pid); + return 0; + } + + ticks_per_sec = sysconf(_SC_CLK_TCK); + if (ticks_per_sec < 0 && errno == EINVAL) { + lxcfs_debug( + "%s\n", + "failed to determine number of clock ticks in a second"); + return 0; + } + + return (clockticks /= ticks_per_sec); +} + +static uint64_t get_reaper_age(pid_t pid) +{ + uint64_t procstart, uptime, procage; + + /* We need to substract the time the process has started since system + * boot minus the time when the system has started to get the actual + * reaper age. + */ + procstart = get_reaper_start_time_in_sec(pid); + procage = procstart; + if (procstart > 0) { + int ret; + struct timespec spec; + + ret = clock_gettime(CLOCK_BOOTTIME, &spec); + if (ret < 0) + return 0; + /* We could make this more precise here by using the tv_nsec + * field in the timespec struct and convert it to milliseconds + * and then create a double for the seconds and milliseconds but + * that seems more work than it is worth. + */ + uptime = spec.tv_sec; + procage = uptime - procstart; + } + + return procage; +} + +static uint64_t get_reaper_btime(pid) +{ + int ret; + struct sysinfo sys; + uint64_t procstart; + uint64_t uptime; + + ret = sysinfo(&sys); + if (ret < 0) { + lxcfs_debug("%s\n", "failed to retrieve system information"); + return 0; + } + + uptime = (uint64_t)time(NULL) - (uint64_t)sys.uptime; + procstart = get_reaper_start_time_in_sec(pid); + return uptime + procstart; +} + +#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2) static int proc_stat_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { @@ -3493,10 +3622,9 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; int curcpu = -1; /* cpu numbering starts at 0 */ - unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0; + unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0; unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0, - irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0; -#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE + irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0; char cpuall[CPUALL_MAX_SIZE]; /* reserve for cpu all */ char *cache = d->buf + CPUALL_MAX_SIZE; @@ -3532,7 +3660,7 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, //skip first line if (getline(&line, &linelen, f) < 0) { - fprintf(stderr, "proc_stat_read read first line failed\n"); + lxcfs_error("%s\n", "proc_stat_read read first line failed."); goto err; } @@ -3553,7 +3681,7 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3580,7 +3708,7 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3589,8 +3717,17 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, cache_size -= l; total_len += l; - if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq, - &softirq, &steal, &guest) != 9) + if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + &user, + &nice, + &system, + &idle, + &iowait, + &irq, + &softirq, + &steal, + &guest, + &guest_nice) != 10) continue; user_sum += user; nice_sum += nice; @@ -3601,18 +3738,28 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, softirq_sum += softirq; steal_sum += steal; guest_sum += guest; + guest_nice_sum += guest_nice; } cache = d->buf; - int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum); - if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){ + int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + user_sum, + nice_sum, + system_sum, + idle_sum, + iowait_sum, + irq_sum, + softirq_sum, + steal_sum, + guest_sum, + guest_nice_sum); + if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) { memcpy(cache, cpuall, cpuall_len); cache += cpuall_len; - } else{ + } else { /* shouldn't happen */ - fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len); + lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len); cpuall_len = 0; } @@ -3620,7 +3767,8 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, total_len += cpuall_len; d->cached = 1; d->size = total_len; - if (total_len > size ) total_len = size; + if (total_len > size) + total_len = size; memcpy(buf, d->buf, total_len); rv = total_len; @@ -3634,27 +3782,13 @@ err: return rv; } -static long int getreaperage(pid_t pid) -{ - char fnam[100]; - struct stat sb; - int ret; - pid_t qpid; - - qpid = lookup_initpid_in_store(pid); - if (qpid <= 0) - return 0; - - ret = snprintf(fnam, 100, "/proc/%d", qpid); - if (ret < 0 || ret >= 100) - return 0; - - if (lstat(fnam, &sb) < 0) - return 0; - - return time(NULL) - sb.st_ctime; -} - +/* This function retrieves the busy time of a group of tasks by looking at + * cpuacct.usage. Unfortunately, this only makes sense when the container has + * been given it's own cpuacct cgroup. If not, this function will take the busy + * time of all other taks that do not actually belong to the container into + * account as well. If someone has a clever solution for this please send a + * patch! + */ static unsigned long get_reaper_busy(pid_t task) { pid_t initpid = lookup_initpid_in_store(task); @@ -3700,33 +3834,37 @@ static int proc_uptime_read(char *buf, size_t size, off_t offset, { struct fuse_context *fc = fuse_get_context(); struct file_info *d = (struct file_info *)fi->fh; - long int reaperage = getreaperage(fc->pid); - unsigned long int busytime = get_reaper_busy(fc->pid), idletime; + unsigned long int busytime = get_reaper_busy(fc->pid); char *cache = d->buf; ssize_t total_len = 0; + uint64_t idletime, reaperage; #if RELOADTEST iwashere(); #endif if (offset){ - if (offset > d->size) - return -EINVAL; if (!d->cached) return 0; + if (offset > d->size) + return -EINVAL; int left = d->size - offset; total_len = left > size ? size: left; memcpy(buf, cache + offset, total_len); return total_len; } - idletime = reaperage - busytime; - if (idletime > reaperage) - idletime = reaperage; + reaperage = get_reaper_age(fc->pid); + /* To understand why this is done, please read the comment to the + * get_reaper_busy() function. + */ + idletime = reaperage; + if (reaperage >= busytime) + idletime = reaperage - busytime; - total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime); - if (total_len < 0){ - perror("Error writing to cache"); + total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime); + if (total_len < 0 || total_len >= d->buflen){ + lxcfs_error("%s\n", "failed to write to cache"); return 0; } @@ -3844,7 +3982,7 @@ static int proc_diskstats_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3878,8 +4016,7 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, struct fuse_context *fc = fuse_get_context(); struct file_info *d = (struct file_info *)fi->fh; char *cg = NULL; - char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL, - *memswlimit_default_str = NULL, *memswusage_default_str = NULL; + char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL; unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0; ssize_t total_len = 0, rv = 0; ssize_t l = 0; @@ -3904,32 +4041,19 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, return read_file("/proc/swaps", buf, size, d); prune_init_slice(cg); - if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str)) - goto err; + memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str)) goto err; - memlimit = strtoul(memlimit_str, NULL, 10); memusage = strtoul(memusage_str, NULL, 10); if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) && cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) { - /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */ - if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str)) - goto err; - if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str)) - goto err; - - memswlimit = strtoul(memswlimit_str, NULL, 10); + memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes"); memswusage = strtoul(memswusage_str, NULL, 10); - if (!strcmp(memswlimit_str, memswlimit_default_str)) - memswlimit = 0; - if (!strcmp(memswusage_str, memswusage_default_str)) - memswusage = 0; - swap_total = (memswlimit - memlimit) / 1024; swap_free = (memswusage - memusage) / 1024; } @@ -3983,8 +4107,6 @@ err: free(memlimit_str); free(memusage_str); free(memswusage_str); - free(memswusage_default_str); - free(memswlimit_default_str); return rv; } @@ -4145,7 +4267,7 @@ static bool mkdir_p(const char *dir, mode_t mode) if (!makeme) return false; if (mkdir(makeme, mode) && errno != EEXIST) { - fprintf(stderr, "failed to create directory '%s': %s", + lxcfs_error("Failed to create directory '%s': %s.\n", makeme, strerror(errno)); free(makeme); return false; @@ -4159,37 +4281,87 @@ static bool mkdir_p(const char *dir, mode_t mode) static bool umount_if_mounted(void) { if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { - fprintf(stderr, "failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); + lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); return false; } return true; } -static int pivot_enter(void) +/* __typeof__ should be safe to use with all compilers. */ +typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic; +static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val) +{ + return (fs->f_type == (fs_type_magic)magic_val); +} + +/* + * looking at fs/proc_namespace.c, it appears we can + * actually expect the rootfs entry to very specifically contain + * " - rootfs rootfs " + * IIUC, so long as we've chrooted so that rootfs is not our root, + * the rootfs entry should always be skipped in mountinfo contents. + */ +static bool is_on_ramfs(void) +{ + FILE *f; + char *p, *p2; + char *line = NULL; + size_t len = 0; + int i; + + f = fopen("/proc/self/mountinfo", "r"); + if (!f) + return false; + + while (getline(&line, &len, f) != -1) { + for (p = line, i = 0; p && i < 4; i++) + p = strchr(p + 1, ' '); + if (!p) + continue; + p2 = strchr(p + 1, ' '); + if (!p2) + continue; + *p2 = '\0'; + if (strcmp(p + 1, "/") == 0) { + // this is '/'. is it the ramfs? + p = strchr(p2 + 1, '-'); + if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) { + free(line); + fclose(f); + return true; + } + } + } + free(line); + fclose(f); + return false; +} + +static int pivot_enter() { int ret = -1, oldroot = -1, newroot = -1; oldroot = open("/", O_DIRECTORY | O_RDONLY); if (oldroot < 0) { - fprintf(stderr, "%s: Failed to open old root for fchdir.\n", __func__); + lxcfs_error("%s\n", "Failed to open old root for fchdir."); return ret; } newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY); if (newroot < 0) { - fprintf(stderr, "%s: Failed to open new root for fchdir.\n", __func__); + lxcfs_error("%s\n", "Failed to open new root for fchdir."); goto err; } /* change into new root fs */ if (fchdir(newroot) < 0) { - fprintf(stderr, "%s: Failed to change directory to new rootfs: %s.\n", __func__, ROOTDIR); + lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR); goto err; } /* pivot_root into our new root fs */ if (pivot_root(".", ".") < 0) { - fprintf(stderr, "%s: pivot_root() syscall failed: %s.\n", __func__, strerror(errno)); + lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno)); goto err; } @@ -4199,16 +4371,17 @@ static int pivot_enter(void) * to the old-root. */ if (fchdir(oldroot) < 0) { - fprintf(stderr, "%s: Failed to enter old root.\n", __func__); + lxcfs_error("%s\n", "Failed to enter old root."); goto err; } + if (umount2(".", MNT_DETACH) < 0) { - fprintf(stderr, "%s: Failed to detach old root.\n", __func__); + lxcfs_error("%s\n", "Failed to detach old root."); goto err; } if (fchdir(newroot) < 0) { - fprintf(stderr, "%s: Failed to re-enter new root.\n", __func__); + lxcfs_error("%s\n", "Failed to re-enter new root."); goto err; } @@ -4219,79 +4392,143 @@ err: close(oldroot); if (newroot > 0) close(newroot); + return ret; } +static int chroot_enter() +{ + if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { + lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); + return -1; + } + + if (chroot(".") < 0) { + lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); + return -1; + } + + if (chdir("/") < 0) { + lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); + return -1; + } + + return 0; +} + +static int permute_and_enter(void) +{ + struct statfs sb; + + if (statfs("/", &sb) < 0) { + lxcfs_error("%s\n", "Could not stat / mountpoint."); + return -1; + } + + /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will + * likely report TMPFS_MAGIC. Hence, when it reports no we still check + * /proc/1/mountinfo. */ + if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs()) + return chroot_enter(); + + if (pivot_enter() < 0) { + lxcfs_error("%s\n", "Could not perform pivot root."); + return -1; + } + + return 0; +} + /* Prepare our new clean root. */ -static int pivot_prepare(void) +static int permute_prepare(void) { if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { - fprintf(stderr, "%s: Failed to create directory for new root.\n", __func__); + lxcfs_error("%s\n", "Failed to create directory for new root."); return -1; } if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { - fprintf(stderr, "%s: Failed to bind-mount / for new root: %s.\n", __func__, strerror(errno)); + lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); return -1; } if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { - fprintf(stderr, "%s: Failed to bind-mount /run into new root: %s.\n", __func__, strerror(errno)); + lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); return -1; } if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { - printf("%s: failed to move " BASEDIR " into new root: %s.\n", __func__, strerror(errno)); + printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); return -1; } return 0; } -static bool pivot_new_root(void) +/* Calls chroot() on ramfs, pivot_root() in all other cases. */ +static bool permute_root(void) { /* Prepare new root. */ - if (pivot_prepare() < 0) + if (permute_prepare() < 0) return false; /* Pivot into new root. */ - if (pivot_enter() < 0) + if (permute_and_enter() < 0) return false; return true; } -static bool setup_cgfs_dir(void) +static int preserve_mnt_ns(int pid) +{ + int ret; + size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt"); + char path[len]; + + ret = snprintf(path, len, "/proc/%d/ns/mnt", pid); + if (ret < 0 || (size_t)ret >= len) + return -1; + + return open(path, O_RDONLY | O_CLOEXEC); +} + +static bool cgfs_prepare_mounts(void) { if (!mkdir_p(BASEDIR, 0700)) { - fprintf(stderr, "Failed to create lxcfs cgroup mountpoint.\n"); + lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); return false; } if (!umount_if_mounted()) { - fprintf(stderr, "Failed to clean up old lxcfs cgroup mountpoint.\n"); + lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); return false; } if (unshare(CLONE_NEWNS) < 0) { - fprintf(stderr, "%s: Failed to unshare mount namespace: %s.\n", __func__, strerror(errno)); + lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); + return false; + } + + cgroup_mount_ns_fd = preserve_mnt_ns(getpid()); + if (cgroup_mount_ns_fd < 0) { + lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); return false; } if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { - fprintf(stderr, "%s: Failed to remount / private: %s.\n", __func__, strerror(errno)); + lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); return false; } if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { - fprintf(stderr, "Failed to mount tmpfs over lxcfs cgroup mountpoint.\n"); + lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); return false; } return true; } -static bool do_mount_cgroups(void) +static bool cgfs_mount_hierarchies(void) { char *target; size_t clen, len; @@ -4299,11 +4536,13 @@ static bool do_mount_cgroups(void) for (i = 0; i < num_hierarchies; i++) { char *controller = hierarchies[i]; + clen = strlen(controller); len = strlen(BASEDIR) + clen + 2; target = malloc(len); if (!target) return false; + ret = snprintf(target, len, "%s/%s", BASEDIR, controller); if (ret < 0 || ret >= len) { free(target); @@ -4313,8 +4552,12 @@ static bool do_mount_cgroups(void) free(target); return false; } - if (mount(controller, target, "cgroup", 0, controller) < 0) { - fprintf(stderr, "Failed mounting cgroup %s\n", controller); + if (!strcmp(controller, "unified")) + ret = mount("none", target, "cgroup2", 0, NULL); + else + ret = mount(controller, target, "cgroup", 0, controller); + if (ret < 0) { + lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno)); free(target); return false; } @@ -4331,50 +4574,41 @@ static bool do_mount_cgroups(void) static bool cgfs_setup_controllers(void) { - if (!setup_cgfs_dir()) + if (!cgfs_prepare_mounts()) return false; - if (!do_mount_cgroups()) { - fprintf(stderr, "Failed to set up private lxcfs cgroup mounts.\n"); + if (!cgfs_mount_hierarchies()) { + lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts."); return false; } - if (!pivot_new_root()) + if (!permute_root()) return false; return true; } -static int preserve_ns(int pid) -{ - int ret; - size_t len = 5 /* /proc */ + 21 /* /int_as_str */ + 7 /* /ns/mnt */ + 1 /* \0 */; - char path[len]; - - ret = snprintf(path, len, "/proc/%d/ns/mnt", pid); - if (ret < 0 || (size_t)ret >= len) - return -1; - - return open(path, O_RDONLY | O_CLOEXEC); -} - static void __attribute__((constructor)) collect_and_mount_subsystems(void) { FILE *f; - char *line = NULL; + char *cret, *line = NULL; + char cwd[MAXPATHLEN]; size_t len = 0; int i, init_ns = -1; + bool found_unified = false; if ((f = fopen("/proc/self/cgroup", "r")) == NULL) { - fprintf(stderr, "Error opening /proc/self/cgroup: %s\n", strerror(errno)); + lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno)); return; } + while (getline(&line, &len, f) != -1) { - char *p, *p2; + char *idx, *p, *p2; p = strchr(line, ':'); if (!p) goto out; + idx = line; *(p++) = '\0'; p2 = strrchr(p, ':'); @@ -4387,32 +4621,49 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void) * because it parses out the empty string "" and later on passes * it to mount(). Let's skip such entries. */ - if (!strcmp(p, "")) - continue; + if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) { + found_unified = true; + p = "unified"; + } if (!store_hierarchy(line, p)) goto out; } /* Preserve initial namespace. */ - init_ns = preserve_ns(getpid()); - if (init_ns < 0) + init_ns = preserve_mnt_ns(getpid()); + if (init_ns < 0) { + lxcfs_error("%s\n", "Failed to preserve initial mount namespace."); goto out; + } - fd_hierarchies = malloc(sizeof(int *) * num_hierarchies); - if (!fd_hierarchies) + fd_hierarchies = malloc(sizeof(int) * num_hierarchies); + if (!fd_hierarchies) { + lxcfs_error("%s\n", strerror(errno)); goto out; + } for (i = 0; i < num_hierarchies; i++) fd_hierarchies[i] = -1; + cret = getcwd(cwd, MAXPATHLEN); + if (!cret) + lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno)); + /* This function calls unshare(CLONE_NEWNS) our initial mount namespace * to privately mount lxcfs cgroups. */ - if (!cgfs_setup_controllers()) + if (!cgfs_setup_controllers()) { + lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs."); goto out; + } - if (setns(init_ns, 0) < 0) + if (setns(init_ns, 0) < 0) { + lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno)); goto out; + } + + if (!cret || chdir(cwd) < 0) + lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno)); print_subsystems(); @@ -4427,6 +4678,8 @@ static void __attribute__((destructor)) free_subsystems(void) { int i; + lxcfs_debug("%s\n", "Running destructor for liblxcfs."); + for (i = 0; i < num_hierarchies; i++) { if (hierarchies[i]) free(hierarchies[i]); @@ -4435,4 +4688,7 @@ static void __attribute__((destructor)) free_subsystems(void) } free(hierarchies); free(fd_hierarchies); + + if (cgroup_mount_ns_fd >= 0) + close(cgroup_mount_ns_fd); }