X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=bindings.c;h=9657160fe485cf70a2248b3f9b8345dc818f76cc;hb=3137a0a63db273005d62973a5d055a7623e98631;hp=ac88f48f0df7abcf06c2608fd2716de71dd0d890;hpb=2b1912c812e153b2958c87e20764a9832b9a664a;p=mirror_lxcfs.git diff --git a/bindings.c b/bindings.c index ac88f48..9657160 100644 --- a/bindings.c +++ b/bindings.c @@ -8,20 +8,24 @@ #define FUSE_USE_VERSION 26 +#define __STDC_FORMAT_MACROS #include #include #include #include +#include #include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -29,10 +33,15 @@ #include #include #include +#include +#include #include "bindings.h" #include "config.h" // for VERSION +/* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */ +#define LXCFS_NUMSTRLEN64 21 + /* Define pivot_root() if missing from the C library */ #ifndef HAVE_PIVOT_ROOT static int pivot_root(const char * new_root, const char * put_old) @@ -70,8 +79,8 @@ struct file_info { int cached; }; -/* reserve buffer size, for cpuall in /proc/stat */ -#define BUF_RESERVE_SIZE 256 +/* Reserve buffer size to account for file size changes. */ +#define BUF_RESERVE_SIZE 512 /* * A table caching which pid is init for a pid namespace. @@ -130,6 +139,7 @@ static char **hierarchies; * another namespace using the *at() family of functions * {openat(), fchownat(), ...}. */ static int *fd_hierarchies; +static int cgroup_mount_ns_fd = -1; static void unlock_mutex(pthread_mutex_t *l) { @@ -412,6 +422,7 @@ static void print_subsystems(void) { int i; + fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd); fprintf(stderr, "hierarchies:\n"); for (i = 0; i < num_hierarchies; i++) { if (hierarchies[i]) @@ -861,11 +872,11 @@ bool cgfs_get_value(const char *controller, const char *cgroup, const char *file fnam = alloca(len); ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file); if (ret < 0 || (size_t)ret >= len) - return NULL; + return false; fd = openat(cfd, fnam, O_RDONLY); if (fd < 0) - return NULL; + return false; *value = slurp_file(fnam, fd); return *value != NULL; @@ -1671,11 +1682,6 @@ int cg_getattr(const char *path, struct stat *sb) ret = -ENOENT; goto out; } - if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) { - ret = -EACCES; - goto out; - } - ret = 0; } @@ -2911,7 +2917,7 @@ int cg_rmdir(const char *path) if (initpid <= 0) initpid = fc->pid; if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) { - if (!last || strcmp(next, last) == 0) + if (!last || (next && (strcmp(next, last) == 0))) ret = -EBUSY; else ret = -ENOENT; @@ -2955,23 +2961,23 @@ static void parse_memstat(char *memstat, unsigned long *cached, char *eol; while (*memstat) { - if (startswith(memstat, "cache")) { + if (startswith(memstat, "total_cache")) { sscanf(memstat + 11, "%lu", cached); *cached /= 1024; - } else if (startswith(memstat, "active_anon")) { - sscanf(memstat + 11, "%lu", active_anon); + } else if (startswith(memstat, "total_active_anon")) { + sscanf(memstat + 17, "%lu", active_anon); *active_anon /= 1024; - } else if (startswith(memstat, "inactive_anon")) { - sscanf(memstat + 11, "%lu", inactive_anon); + } else if (startswith(memstat, "total_inactive_anon")) { + sscanf(memstat + 19, "%lu", inactive_anon); *inactive_anon /= 1024; - } else if (startswith(memstat, "active_file")) { - sscanf(memstat + 11, "%lu", active_file); + } else if (startswith(memstat, "total_active_file")) { + sscanf(memstat + 17, "%lu", active_file); *active_file /= 1024; - } else if (startswith(memstat, "inactive_file")) { - sscanf(memstat + 11, "%lu", inactive_file); + } else if (startswith(memstat, "total_inactive_file")) { + sscanf(memstat + 19, "%lu", inactive_file); *inactive_file /= 1024; - } else if (startswith(memstat, "unevictable")) { - sscanf(memstat + 11, "%lu", unevictable); + } else if (startswith(memstat, "total_unevictable")) { + sscanf(memstat + 17, "%lu", unevictable); *unevictable /= 1024; } eol = strchr(memstat, '\n'); @@ -3049,12 +3055,12 @@ static int read_file(const char *path, char *buf, size_t size, * FUSE ops for /proc */ -static unsigned long get_memlimit(const char *cgroup) +static unsigned long get_memlimit(const char *cgroup, const char *file) { char *memlimit_str = NULL; unsigned long memlimit = -1; - if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &memlimit_str)) + if (cgfs_get_value("memory", cgroup, file, &memlimit_str)) memlimit = strtoul(memlimit_str, NULL, 10); free(memlimit_str); @@ -3062,16 +3068,16 @@ static unsigned long get_memlimit(const char *cgroup) return memlimit; } -static unsigned long get_min_memlimit(const char *cgroup) +static unsigned long get_min_memlimit(const char *cgroup, const char *file) { char *copy = strdupa(cgroup); unsigned long memlimit = 0, retlimit; - retlimit = get_memlimit(copy); + retlimit = get_memlimit(copy, file); while (strcmp(copy, "/") != 0) { copy = dirname(copy); - memlimit = get_memlimit(copy); + memlimit = get_memlimit(copy, file); if (memlimit != -1 && memlimit < retlimit) retlimit = memlimit; }; @@ -3086,11 +3092,11 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, struct file_info *d = (struct file_info *)fi->fh; char *cg; char *memusage_str = NULL, *memstat_str = NULL, - *memswlimit_str = NULL, *memswusage_str = NULL, - *memswlimit_default_str = NULL, *memswusage_default_str = NULL; + *memswlimit_str = NULL, *memswusage_str = NULL; unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0, cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0, - active_file = 0, inactive_file = 0, unevictable = 0; + active_file = 0, inactive_file = 0, unevictable = 0, + hostswtotal = 0; char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; char *cache = d->buf; @@ -3116,7 +3122,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, return read_file("/proc/meminfo", buf, size, d); prune_init_slice(cg); - memlimit = get_min_memlimit(cg); + memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str)) goto err; if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str)) @@ -3127,20 +3133,9 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) && cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str)) { - /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */ - if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str)) - goto err; - if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str)) - goto err; - - memswlimit = strtoul(memswlimit_str, NULL, 10); + memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes"); memswusage = strtoul(memswusage_str, NULL, 10); - if (!strcmp(memswlimit_str, memswlimit_default_str)) - memswlimit = 0; - if (!strcmp(memswusage_str, memswusage_default_str)) - memswusage = 0; - memswlimit = memswlimit / 1024; memswusage = memswusage / 1024; } @@ -3163,7 +3158,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, memset(lbuf, 0, 100); if (startswith(line, "MemTotal:")) { - sscanf(line+14, "%lu", &hosttotal); + sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal); if (hosttotal < memlimit) memlimit = hosttotal; snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit); @@ -3172,13 +3167,16 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage); printme = lbuf; } else if (startswith(line, "MemAvailable:")) { - snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage); + snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached); printme = lbuf; } else if (startswith(line, "SwapTotal:") && memswlimit > 0) { - snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit); + sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal); + if (hostswtotal < memswlimit) + memswlimit = hostswtotal; + snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit); printme = lbuf; } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) { - unsigned long swaptotal = memswlimit - memlimit, + unsigned long swaptotal = memswlimit, swapusage = memswusage - memusage, swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0; snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree); @@ -3195,11 +3193,11 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, } else if (startswith(line, "SwapCached:")) { snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL); printme = lbuf; - } else if (startswith(line, "Active")) { + } else if (startswith(line, "Active:")) { snprintf(lbuf, 100, "Active: %8lu kB\n", active_anon + active_file); printme = lbuf; - } else if (startswith(line, "Inactive")) { + } else if (startswith(line, "Inactive:")) { snprintf(lbuf, 100, "Inactive: %8lu kB\n", inactive_anon + inactive_file); printme = lbuf; @@ -3260,8 +3258,6 @@ err: free(memswlimit_str); free(memswusage_str); free(memstat_str); - free(memswlimit_default_str); - free(memswusage_default_str); return rv; } @@ -3467,6 +3463,137 @@ err: return rv; } +static uint64_t get_reaper_start_time(pid_t pid) +{ + int ret; + FILE *f; + uint64_t starttime; + /* strlen("/proc/") = 6 + * + + * LXCFS_NUMSTRLEN64 + * + + * strlen("/stat") = 5 + * + + * \0 = 1 + * */ +#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1) + char path[__PROC_PID_STAT_LEN]; + pid_t qpid; + + qpid = lookup_initpid_in_store(pid); + if (qpid <= 0) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid); + if (ret < 0 || ret >= __PROC_PID_STAT_LEN) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + f = fopen(path, "r"); + if (!f) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + /* Note that the *scanf() argument supression requires that length + * modifiers such as "l" are omitted. Otherwise some compilers will yell + * at us. It's like telling someone you're not married and then asking + * if you can bring your wife to the party. + */ + ret = fscanf(f, "%*d " /* (1) pid %d */ + "%*s " /* (2) comm %s */ + "%*c " /* (3) state %c */ + "%*d " /* (4) ppid %d */ + "%*d " /* (5) pgrp %d */ + "%*d " /* (6) session %d */ + "%*d " /* (7) tty_nr %d */ + "%*d " /* (8) tpgid %d */ + "%*u " /* (9) flags %u */ + "%*u " /* (10) minflt %lu */ + "%*u " /* (11) cminflt %lu */ + "%*u " /* (12) majflt %lu */ + "%*u " /* (13) cmajflt %lu */ + "%*u " /* (14) utime %lu */ + "%*u " /* (15) stime %lu */ + "%*d " /* (16) cutime %ld */ + "%*d " /* (17) cstime %ld */ + "%*d " /* (18) priority %ld */ + "%*d " /* (19) nice %ld */ + "%*d " /* (20) num_threads %ld */ + "%*d " /* (21) itrealvalue %ld */ + "%" PRIu64, /* (22) starttime %llu */ + &starttime); + if (ret != 1) { + fclose(f); + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + fclose(f); + + errno = 0; + return starttime; +} + +static uint64_t get_reaper_start_time_in_sec(pid_t pid) +{ + uint64_t clockticks; + int64_t ticks_per_sec; + + clockticks = get_reaper_start_time(pid); + if (clockticks == 0 && errno == EINVAL) { + lxcfs_debug("failed to retrieve start time of pid %d\n", pid); + return 0; + } + + ticks_per_sec = sysconf(_SC_CLK_TCK); + if (ticks_per_sec < 0 && errno == EINVAL) { + lxcfs_debug( + "%s\n", + "failed to determine number of clock ticks in a second"); + return 0; + } + + return (clockticks /= ticks_per_sec); +} + +static uint64_t get_reaper_age(pid_t pid) +{ + uint64_t procstart, uptime, procage; + + /* We need to substract the time the process has started since system + * boot minus the time when the system has started to get the actual + * reaper age. + */ + procstart = get_reaper_start_time_in_sec(pid); + procage = procstart; + if (procstart > 0) { + int ret; + struct timespec spec; + + ret = clock_gettime(CLOCK_BOOTTIME, &spec); + if (ret < 0) + return 0; + /* We could make this more precise here by using the tv_nsec + * field in the timespec struct and convert it to milliseconds + * and then create a double for the seconds and milliseconds but + * that seems more work than it is worth. + */ + uptime = spec.tv_sec; + procage = uptime - procstart; + } + + return procage; +} + +#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2) static int proc_stat_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { @@ -3477,10 +3604,9 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; int curcpu = -1; /* cpu numbering starts at 0 */ - unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0; + unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0; unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0, - irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0; -#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE + irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0; char cpuall[CPUALL_MAX_SIZE]; /* reserve for cpu all */ char *cache = d->buf + CPUALL_MAX_SIZE; @@ -3573,8 +3699,17 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, cache_size -= l; total_len += l; - if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq, - &softirq, &steal, &guest) != 9) + if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + &user, + &nice, + &system, + &idle, + &iowait, + &irq, + &softirq, + &steal, + &guest, + &guest_nice) != 10) continue; user_sum += user; nice_sum += nice; @@ -3585,16 +3720,26 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, softirq_sum += softirq; steal_sum += steal; guest_sum += guest; + guest_nice_sum += guest_nice; } cache = d->buf; - int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum); - if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){ + int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + user_sum, + nice_sum, + system_sum, + idle_sum, + iowait_sum, + irq_sum, + softirq_sum, + steal_sum, + guest_sum, + guest_nice_sum); + if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) { memcpy(cache, cpuall, cpuall_len); cache += cpuall_len; - } else{ + } else { /* shouldn't happen */ lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len); cpuall_len = 0; @@ -3604,7 +3749,8 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, total_len += cpuall_len; d->cached = 1; d->size = total_len; - if (total_len > size ) total_len = size; + if (total_len > size) + total_len = size; memcpy(buf, d->buf, total_len); rv = total_len; @@ -3618,27 +3764,13 @@ err: return rv; } -static long int getreaperage(pid_t pid) -{ - char fnam[100]; - struct stat sb; - int ret; - pid_t qpid; - - qpid = lookup_initpid_in_store(pid); - if (qpid <= 0) - return 0; - - ret = snprintf(fnam, 100, "/proc/%d", qpid); - if (ret < 0 || ret >= 100) - return 0; - - if (lstat(fnam, &sb) < 0) - return 0; - - return time(NULL) - sb.st_ctime; -} - +/* This function retrieves the busy time of a group of tasks by looking at + * cpuacct.usage. Unfortunately, this only makes sense when the container has + * been given it's own cpuacct cgroup. If not, this function will take the busy + * time of all other taks that do not actually belong to the container into + * account as well. If someone has a clever solution for this please send a + * patch! + */ static unsigned long get_reaper_busy(pid_t task) { pid_t initpid = lookup_initpid_in_store(task); @@ -3684,33 +3816,37 @@ static int proc_uptime_read(char *buf, size_t size, off_t offset, { struct fuse_context *fc = fuse_get_context(); struct file_info *d = (struct file_info *)fi->fh; - long int reaperage = getreaperage(fc->pid); - unsigned long int busytime = get_reaper_busy(fc->pid), idletime; + unsigned long int busytime = get_reaper_busy(fc->pid); char *cache = d->buf; ssize_t total_len = 0; + uint64_t idletime, reaperage; #if RELOADTEST iwashere(); #endif if (offset){ - if (offset > d->size) - return -EINVAL; if (!d->cached) return 0; + if (offset > d->size) + return -EINVAL; int left = d->size - offset; total_len = left > size ? size: left; memcpy(buf, cache + offset, total_len); return total_len; } - idletime = reaperage - busytime; - if (idletime > reaperage) - idletime = reaperage; + reaperage = get_reaper_age(fc->pid); + /* To understand why this is done, please read the comment to the + * get_reaper_busy() function. + */ + idletime = reaperage; + if (reaperage >= busytime) + idletime = reaperage - busytime; - total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime); - if (total_len < 0){ - perror("Error writing to cache"); + total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime); + if (total_len < 0 || total_len >= d->buflen){ + lxcfs_error("%s\n", "failed to write to cache"); return 0; } @@ -3862,8 +3998,7 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, struct fuse_context *fc = fuse_get_context(); struct file_info *d = (struct file_info *)fi->fh; char *cg = NULL; - char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL, - *memswlimit_default_str = NULL, *memswusage_default_str = NULL; + char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL; unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0; ssize_t total_len = 0, rv = 0; ssize_t l = 0; @@ -3888,32 +4023,19 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, return read_file("/proc/swaps", buf, size, d); prune_init_slice(cg); - if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str)) - goto err; + memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str)) goto err; - memlimit = strtoul(memlimit_str, NULL, 10); memusage = strtoul(memusage_str, NULL, 10); if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) && cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) { - /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */ - if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str)) - goto err; - if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str)) - goto err; - - memswlimit = strtoul(memswlimit_str, NULL, 10); + memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes"); memswusage = strtoul(memswusage_str, NULL, 10); - if (!strcmp(memswlimit_str, memswlimit_default_str)) - memswlimit = 0; - if (!strcmp(memswusage_str, memswusage_default_str)) - memswusage = 0; - swap_total = (memswlimit - memlimit) / 1024; swap_free = (memswusage - memusage) / 1024; } @@ -3967,8 +4089,6 @@ err: free(memlimit_str); free(memusage_str); free(memswusage_str); - free(memswusage_default_str); - free(memswlimit_default_str); return rv; } @@ -4149,7 +4269,57 @@ static bool umount_if_mounted(void) return true; } -static int pivot_enter(void) +/* __typeof__ should be safe to use with all compilers. */ +typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic; +static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val) +{ + return (fs->f_type == (fs_type_magic)magic_val); +} + +/* + * looking at fs/proc_namespace.c, it appears we can + * actually expect the rootfs entry to very specifically contain + * " - rootfs rootfs " + * IIUC, so long as we've chrooted so that rootfs is not our root, + * the rootfs entry should always be skipped in mountinfo contents. + */ +static bool is_on_ramfs(void) +{ + FILE *f; + char *p, *p2; + char *line = NULL; + size_t len = 0; + int i; + + f = fopen("/proc/self/mountinfo", "r"); + if (!f) + return false; + + while (getline(&line, &len, f) != -1) { + for (p = line, i = 0; p && i < 4; i++) + p = strchr(p + 1, ' '); + if (!p) + continue; + p2 = strchr(p + 1, ' '); + if (!p2) + continue; + *p2 = '\0'; + if (strcmp(p + 1, "/") == 0) { + // this is '/'. is it the ramfs? + p = strchr(p2 + 1, '-'); + if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) { + free(line); + fclose(f); + return true; + } + } + } + free(line); + fclose(f); + return false; +} + +static int pivot_enter() { int ret = -1, oldroot = -1, newroot = -1; @@ -4186,6 +4356,7 @@ static int pivot_enter(void) lxcfs_error("%s\n", "Failed to enter old root."); goto err; } + if (umount2(".", MNT_DETACH) < 0) { lxcfs_error("%s\n", "Failed to detach old root."); goto err; @@ -4203,11 +4374,55 @@ err: close(oldroot); if (newroot > 0) close(newroot); + return ret; } +static int chroot_enter() +{ + if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { + lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); + return -1; + } + + if (chroot(".") < 0) { + lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); + return -1; + } + + if (chdir("/") < 0) { + lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); + return -1; + } + + return 0; +} + +static int permute_and_enter(void) +{ + struct statfs sb; + + if (statfs("/", &sb) < 0) { + lxcfs_error("%s\n", "Could not stat / mountpoint."); + return -1; + } + + /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will + * likely report TMPFS_MAGIC. Hence, when it reports no we still check + * /proc/1/mountinfo. */ + if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs()) + return chroot_enter(); + + if (pivot_enter() < 0) { + lxcfs_error("%s\n", "Could not perform pivot root."); + return -1; + } + + return 0; +} + /* Prepare our new clean root. */ -static int pivot_prepare(void) +static int permute_prepare(void) { if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { lxcfs_error("%s\n", "Failed to create directory for new root."); @@ -4232,20 +4447,34 @@ static int pivot_prepare(void) return 0; } -static bool pivot_new_root(void) +/* Calls chroot() on ramfs, pivot_root() in all other cases. */ +static bool permute_root(void) { /* Prepare new root. */ - if (pivot_prepare() < 0) + if (permute_prepare() < 0) return false; /* Pivot into new root. */ - if (pivot_enter() < 0) + if (permute_and_enter() < 0) return false; return true; } -static bool setup_cgfs_dir(void) +static int preserve_mnt_ns(int pid) +{ + int ret; + size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt"); + char path[len]; + + ret = snprintf(path, len, "/proc/%d/ns/mnt", pid); + if (ret < 0 || (size_t)ret >= len) + return -1; + + return open(path, O_RDONLY | O_CLOEXEC); +} + +static bool cgfs_prepare_mounts(void) { if (!mkdir_p(BASEDIR, 0700)) { lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); @@ -4262,6 +4491,12 @@ static bool setup_cgfs_dir(void) return false; } + cgroup_mount_ns_fd = preserve_mnt_ns(getpid()); + if (cgroup_mount_ns_fd < 0) { + lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); + return false; + } + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); return false; @@ -4275,7 +4510,7 @@ static bool setup_cgfs_dir(void) return true; } -static bool do_mount_cgroups(void) +static bool cgfs_mount_hierarchies(void) { char *target; size_t clen, len; @@ -4283,11 +4518,13 @@ static bool do_mount_cgroups(void) for (i = 0; i < num_hierarchies; i++) { char *controller = hierarchies[i]; + clen = strlen(controller); len = strlen(BASEDIR) + clen + 2; target = malloc(len); if (!target) return false; + ret = snprintf(target, len, "%s/%s", BASEDIR, controller); if (ret < 0 || ret >= len) { free(target); @@ -4297,8 +4534,12 @@ static bool do_mount_cgroups(void) free(target); return false; } - if (mount(controller, target, "cgroup", 0, controller) < 0) { - lxcfs_error("Failed mounting cgroup %s\n", controller); + if (!strcmp(controller, "unified")) + ret = mount("none", target, "cgroup2", 0, NULL); + else + ret = mount(controller, target, "cgroup", 0, controller); + if (ret < 0) { + lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno)); free(target); return false; } @@ -4315,33 +4556,20 @@ static bool do_mount_cgroups(void) static bool cgfs_setup_controllers(void) { - if (!setup_cgfs_dir()) + if (!cgfs_prepare_mounts()) return false; - if (!do_mount_cgroups()) { + if (!cgfs_mount_hierarchies()) { lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts."); return false; } - if (!pivot_new_root()) + if (!permute_root()) return false; return true; } -static int preserve_ns(int pid) -{ - int ret; - size_t len = 5 /* /proc */ + 21 /* /int_as_str */ + 7 /* /ns/mnt */ + 1 /* \0 */; - char path[len]; - - ret = snprintf(path, len, "/proc/%d/ns/mnt", pid); - if (ret < 0 || (size_t)ret >= len) - return -1; - - return open(path, O_RDONLY | O_CLOEXEC); -} - static void __attribute__((constructor)) collect_and_mount_subsystems(void) { FILE *f; @@ -4349,6 +4577,7 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void) char cwd[MAXPATHLEN]; size_t len = 0; int i, init_ns = -1; + bool found_unified = false; if ((f = fopen("/proc/self/cgroup", "r")) == NULL) { lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno)); @@ -4356,11 +4585,12 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void) } while (getline(&line, &len, f) != -1) { - char *p, *p2; + char *idx, *p, *p2; p = strchr(line, ':'); if (!p) goto out; + idx = line; *(p++) = '\0'; p2 = strrchr(p, ':'); @@ -4373,21 +4603,23 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void) * because it parses out the empty string "" and later on passes * it to mount(). Let's skip such entries. */ - if (!strcmp(p, "")) - continue; + if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) { + found_unified = true; + p = "unified"; + } if (!store_hierarchy(line, p)) goto out; } /* Preserve initial namespace. */ - init_ns = preserve_ns(getpid()); + init_ns = preserve_mnt_ns(getpid()); if (init_ns < 0) { lxcfs_error("%s\n", "Failed to preserve initial mount namespace."); goto out; } - fd_hierarchies = malloc(sizeof(int *) * num_hierarchies); + fd_hierarchies = malloc(sizeof(int) * num_hierarchies); if (!fd_hierarchies) { lxcfs_error("%s\n", strerror(errno)); goto out; @@ -4438,4 +4670,7 @@ static void __attribute__((destructor)) free_subsystems(void) } free(hierarchies); free(fd_hierarchies); + + if (cgroup_mount_ns_fd >= 0) + close(cgroup_mount_ns_fd); }