X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=bindings.c;h=08267f77cde63e9d9d4dec599d5a66115c52c3da;hb=beb5024eaef96e3fae7c8e13646cc7971a9ddc4a;hp=9fbabb6a947403f3b594df2a7e63b99d7273379b;hpb=2f306ad3b96558dbcbb736e59612f5a68ccc500d;p=mirror_lxcfs.git diff --git a/bindings.c b/bindings.c index 9fbabb6..08267f7 100644 --- a/bindings.c +++ b/bindings.c @@ -8,14 +8,17 @@ #define FUSE_USE_VERSION 26 +#define __STDC_FORMAT_MACROS #include #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -30,11 +33,15 @@ #include #include #include +#include #include #include "bindings.h" #include "config.h" // for VERSION +/* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */ +#define LXCFS_NUMSTRLEN64 21 + /* Define pivot_root() if missing from the C library */ #ifndef HAVE_PIVOT_ROOT static int pivot_root(const char * new_root, const char * put_old) @@ -59,6 +66,7 @@ enum { LXC_TYPE_PROC_STAT, LXC_TYPE_PROC_DISKSTATS, LXC_TYPE_PROC_SWAPS, + LXC_TYPE_PROC_LOADAVG, }; struct file_info { @@ -72,8 +80,213 @@ struct file_info { int cached; }; -/* reserve buffer size, for cpuall in /proc/stat */ -#define BUF_RESERVE_SIZE 256 +struct cpuacct_usage { + uint64_t user; + uint64_t system; +}; + +/* The function of hash table.*/ +#define LOAD_SIZE 100 /*the size of hash_table */ +#define FLUSH_TIME 5 /*the flush rate */ +#define DEPTH_DIR 3 /*the depth of per cgroup */ +/* The function of calculate loadavg .*/ +#define FSHIFT 11 /* nr of bits of precision */ +#define FIXED_1 (1<> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) +/* + * This parameter is used for proc_loadavg_read(). + * 1 means use loadavg, 0 means not use. + */ +static int loadavg = 0; +static volatile sig_atomic_t loadavg_stop = 0; +static int calc_hash(char *name) +{ + unsigned int hash = 0; + unsigned int x = 0; + /* ELFHash algorithm. */ + while (*name) { + hash = (hash << 4) + *name++; + x = hash & 0xf0000000; + if (x != 0) + hash ^= (x >> 24); + hash &= ~x; + } + return ((hash & 0x7fffffff) % LOAD_SIZE); +} + +struct load_node { + char *cg; /*cg */ + unsigned long avenrun[3]; /* Load averages */ + unsigned int run_pid; + unsigned int total_pid; + unsigned int last_pid; + int cfd; /* The file descriptor of the mounted cgroup */ + struct load_node *next; + struct load_node **pre; +}; + +struct load_head { + /* + * The lock is about insert load_node and refresh load_node.To the first + * load_node of each hash bucket, insert and refresh in this hash bucket is + * mutually exclusive. + */ + pthread_mutex_t lock; + /* + * The rdlock is about read loadavg and delete load_node.To each hash + * bucket, read and delete is mutually exclusive. But at the same time, we + * allow paratactic read operation. This rdlock is at list level. + */ + pthread_rwlock_t rdlock; + /* + * The rilock is about read loadavg and insert load_node.To the first + * load_node of each hash bucket, read and insert is mutually exclusive. + * But at the same time, we allow paratactic read operation. + */ + pthread_rwlock_t rilock; + struct load_node *next; +}; + +static struct load_head load_hash[LOAD_SIZE]; /* hash table */ +/* + * init_load initialize the hash table. + * Return 0 on success, return -1 on failure. + */ +static int init_load(void) +{ + int i; + int ret; + + for (i = 0; i < LOAD_SIZE; i++) { + load_hash[i].next = NULL; + ret = pthread_mutex_init(&load_hash[i].lock, NULL); + if (ret != 0) { + lxcfs_error("%s\n", "Failed to initialize lock"); + goto out3; + } + ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL); + if (ret != 0) { + lxcfs_error("%s\n", "Failed to initialize rdlock"); + goto out2; + } + ret = pthread_rwlock_init(&load_hash[i].rilock, NULL); + if (ret != 0) { + lxcfs_error("%s\n", "Failed to initialize rilock"); + goto out1; + } + } + return 0; +out1: + pthread_rwlock_destroy(&load_hash[i].rdlock); +out2: + pthread_mutex_destroy(&load_hash[i].lock); +out3: + while (i > 0) { + i--; + pthread_mutex_destroy(&load_hash[i].lock); + pthread_rwlock_destroy(&load_hash[i].rdlock); + pthread_rwlock_destroy(&load_hash[i].rilock); + } + return -1; +} + +static void insert_node(struct load_node **n, int locate) +{ + struct load_node *f; + + pthread_mutex_lock(&load_hash[locate].lock); + pthread_rwlock_wrlock(&load_hash[locate].rilock); + f = load_hash[locate].next; + load_hash[locate].next = *n; + + (*n)->pre = &(load_hash[locate].next); + if (f) + f->pre = &((*n)->next); + (*n)->next = f; + pthread_mutex_unlock(&load_hash[locate].lock); + pthread_rwlock_unlock(&load_hash[locate].rilock); +} +/* + * locate_node() finds special node. Not return NULL means success. + * It should be noted that rdlock isn't unlocked at the end of code + * because this function is used to read special node. Delete is not + * allowed before read has ended. + * unlock rdlock only in proc_loadavg_read(). + */ +static struct load_node *locate_node(char *cg, int locate) +{ + struct load_node *f = NULL; + int i = 0; + + pthread_rwlock_rdlock(&load_hash[locate].rilock); + pthread_rwlock_rdlock(&load_hash[locate].rdlock); + if (load_hash[locate].next == NULL) { + pthread_rwlock_unlock(&load_hash[locate].rilock); + return f; + } + f = load_hash[locate].next; + pthread_rwlock_unlock(&load_hash[locate].rilock); + while (f && ((i = strcmp(f->cg, cg)) != 0)) + f = f->next; + return f; +} +/* Delete the load_node n and return the next node of it. */ +static struct load_node *del_node(struct load_node *n, int locate) +{ + struct load_node *g; + + pthread_rwlock_wrlock(&load_hash[locate].rdlock); + if (n->next == NULL) { + *(n->pre) = NULL; + } else { + *(n->pre) = n->next; + n->next->pre = n->pre; + } + g = n->next; + free(n->cg); + free(n); + pthread_rwlock_unlock(&load_hash[locate].rdlock); + return g; +} + +static void load_free(void) +{ + int i; + struct load_node *f, *p; + + for (i = 0; i < LOAD_SIZE; i++) { + pthread_mutex_lock(&load_hash[i].lock); + pthread_rwlock_wrlock(&load_hash[i].rilock); + pthread_rwlock_wrlock(&load_hash[i].rdlock); + if (load_hash[i].next == NULL) { + pthread_mutex_unlock(&load_hash[i].lock); + pthread_mutex_destroy(&load_hash[i].lock); + pthread_rwlock_unlock(&load_hash[i].rilock); + pthread_rwlock_destroy(&load_hash[i].rilock); + pthread_rwlock_unlock(&load_hash[i].rdlock); + pthread_rwlock_destroy(&load_hash[i].rdlock); + continue; + } + for (f = load_hash[i].next; f; ) { + free(f->cg); + p = f->next; + free(f); + f = p; + } + pthread_mutex_unlock(&load_hash[i].lock); + pthread_mutex_destroy(&load_hash[i].lock); + pthread_rwlock_unlock(&load_hash[i].rilock); + pthread_rwlock_destroy(&load_hash[i].rilock); + pthread_rwlock_unlock(&load_hash[i].rdlock); + pthread_rwlock_destroy(&load_hash[i].rdlock); + } +} +/* Reserve buffer size to account for file size changes. */ +#define BUF_RESERVE_SIZE 512 /* * A table caching which pid is init for a pid namespace. @@ -132,6 +345,7 @@ static char **hierarchies; * another namespace using the *at() family of functions * {openat(), fchownat(), ...}. */ static int *fd_hierarchies; +static int cgroup_mount_ns_fd = -1; static void unlock_mutex(pthread_mutex_t *l) { @@ -370,19 +584,24 @@ static bool write_string(const char *fnam, const char *string, int fd) FILE *f; size_t len, ret; - if (!(f = fdopen(fd, "w"))) + f = fdopen(fd, "w"); + if (!f) return false; + len = strlen(string); ret = fwrite(string, 1, len, f); if (ret != len) { - lxcfs_error("Error writing to file: %s\n", strerror(errno)); + lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n", + strerror(errno), string, fnam); fclose(f); return false; } + if (fclose(f) < 0) { - lxcfs_error("Error writing to file: %s\n", strerror(errno)); + lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam); return false; } + return true; } @@ -414,6 +633,7 @@ static void print_subsystems(void) { int i; + fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd); fprintf(stderr, "hierarchies:\n"); for (i = 0; i < num_hierarchies; i++) { if (hierarchies[i]) @@ -863,11 +1083,11 @@ bool cgfs_get_value(const char *controller, const char *cgroup, const char *file fnam = alloca(len); ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file); if (ret < 0 || (size_t)ret >= len) - return NULL; + return false; fd = openat(cfd, fnam, O_RDONLY); if (fd < 0) - return NULL; + return false; *value = slurp_file(fnam, fd); return *value != NULL; @@ -2908,7 +3128,7 @@ int cg_rmdir(const char *path) if (initpid <= 0) initpid = fc->pid; if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) { - if (!last || strcmp(next, last) == 0) + if (!last || (next && (strcmp(next, last) == 0))) ret = -EBUSY; else ret = -ENOENT; @@ -2952,23 +3172,23 @@ static void parse_memstat(char *memstat, unsigned long *cached, char *eol; while (*memstat) { - if (startswith(memstat, "cache")) { + if (startswith(memstat, "total_cache")) { sscanf(memstat + 11, "%lu", cached); *cached /= 1024; - } else if (startswith(memstat, "active_anon")) { - sscanf(memstat + 11, "%lu", active_anon); + } else if (startswith(memstat, "total_active_anon")) { + sscanf(memstat + 17, "%lu", active_anon); *active_anon /= 1024; - } else if (startswith(memstat, "inactive_anon")) { - sscanf(memstat + 11, "%lu", inactive_anon); + } else if (startswith(memstat, "total_inactive_anon")) { + sscanf(memstat + 19, "%lu", inactive_anon); *inactive_anon /= 1024; - } else if (startswith(memstat, "active_file")) { - sscanf(memstat + 11, "%lu", active_file); + } else if (startswith(memstat, "total_active_file")) { + sscanf(memstat + 17, "%lu", active_file); *active_file /= 1024; - } else if (startswith(memstat, "inactive_file")) { - sscanf(memstat + 11, "%lu", inactive_file); + } else if (startswith(memstat, "total_inactive_file")) { + sscanf(memstat + 19, "%lu", inactive_file); *inactive_file /= 1024; - } else if (startswith(memstat, "unevictable")) { - sscanf(memstat + 11, "%lu", unevictable); + } else if (startswith(memstat, "total_unevictable")) { + sscanf(memstat + 17, "%lu", unevictable); *unevictable /= 1024; } eol = strchr(memstat, '\n'); @@ -3046,12 +3266,12 @@ static int read_file(const char *path, char *buf, size_t size, * FUSE ops for /proc */ -static unsigned long get_memlimit(const char *cgroup) +static unsigned long get_memlimit(const char *cgroup, const char *file) { char *memlimit_str = NULL; unsigned long memlimit = -1; - if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &memlimit_str)) + if (cgfs_get_value("memory", cgroup, file, &memlimit_str)) memlimit = strtoul(memlimit_str, NULL, 10); free(memlimit_str); @@ -3059,16 +3279,16 @@ static unsigned long get_memlimit(const char *cgroup) return memlimit; } -static unsigned long get_min_memlimit(const char *cgroup) +static unsigned long get_min_memlimit(const char *cgroup, const char *file) { char *copy = strdupa(cgroup); unsigned long memlimit = 0, retlimit; - retlimit = get_memlimit(copy); + retlimit = get_memlimit(copy, file); while (strcmp(copy, "/") != 0) { copy = dirname(copy); - memlimit = get_memlimit(copy); + memlimit = get_memlimit(copy, file); if (memlimit != -1 && memlimit < retlimit) retlimit = memlimit; }; @@ -3083,11 +3303,11 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, struct file_info *d = (struct file_info *)fi->fh; char *cg; char *memusage_str = NULL, *memstat_str = NULL, - *memswlimit_str = NULL, *memswusage_str = NULL, - *memswlimit_default_str = NULL, *memswusage_default_str = NULL; + *memswlimit_str = NULL, *memswusage_str = NULL; unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0, cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0, - active_file = 0, inactive_file = 0, unevictable = 0; + active_file = 0, inactive_file = 0, unevictable = 0, + hostswtotal = 0; char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; char *cache = d->buf; @@ -3113,7 +3333,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, return read_file("/proc/meminfo", buf, size, d); prune_init_slice(cg); - memlimit = get_min_memlimit(cg); + memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str)) goto err; if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str)) @@ -3124,20 +3344,9 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) && cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str)) { - /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */ - if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str)) - goto err; - if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str)) - goto err; - - memswlimit = strtoul(memswlimit_str, NULL, 10); + memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes"); memswusage = strtoul(memswusage_str, NULL, 10); - if (!strcmp(memswlimit_str, memswlimit_default_str)) - memswlimit = 0; - if (!strcmp(memswusage_str, memswusage_default_str)) - memswusage = 0; - memswlimit = memswlimit / 1024; memswusage = memswusage / 1024; } @@ -3160,7 +3369,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, memset(lbuf, 0, 100); if (startswith(line, "MemTotal:")) { - sscanf(line+14, "%lu", &hosttotal); + sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal); if (hosttotal < memlimit) memlimit = hosttotal; snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit); @@ -3169,13 +3378,16 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage); printme = lbuf; } else if (startswith(line, "MemAvailable:")) { - snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage); + snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached); printme = lbuf; } else if (startswith(line, "SwapTotal:") && memswlimit > 0) { - snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit); + sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal); + if (hostswtotal < memswlimit) + memswlimit = hostswtotal; + snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit); printme = lbuf; } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) { - unsigned long swaptotal = memswlimit - memlimit, + unsigned long swaptotal = memswlimit, swapusage = memswusage - memusage, swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0; snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree); @@ -3257,8 +3469,6 @@ err: free(memswlimit_str); free(memswusage_str); free(memstat_str); - free(memswlimit_default_str); - free(memswusage_default_str); return rv; } @@ -3464,6 +3674,220 @@ err: return rv; } +static uint64_t get_reaper_start_time(pid_t pid) +{ + int ret; + FILE *f; + uint64_t starttime; + /* strlen("/proc/") = 6 + * + + * LXCFS_NUMSTRLEN64 + * + + * strlen("/stat") = 5 + * + + * \0 = 1 + * */ +#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1) + char path[__PROC_PID_STAT_LEN]; + pid_t qpid; + + qpid = lookup_initpid_in_store(pid); + if (qpid <= 0) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid); + if (ret < 0 || ret >= __PROC_PID_STAT_LEN) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + f = fopen(path, "r"); + if (!f) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + /* Note that the *scanf() argument supression requires that length + * modifiers such as "l" are omitted. Otherwise some compilers will yell + * at us. It's like telling someone you're not married and then asking + * if you can bring your wife to the party. + */ + ret = fscanf(f, "%*d " /* (1) pid %d */ + "%*s " /* (2) comm %s */ + "%*c " /* (3) state %c */ + "%*d " /* (4) ppid %d */ + "%*d " /* (5) pgrp %d */ + "%*d " /* (6) session %d */ + "%*d " /* (7) tty_nr %d */ + "%*d " /* (8) tpgid %d */ + "%*u " /* (9) flags %u */ + "%*u " /* (10) minflt %lu */ + "%*u " /* (11) cminflt %lu */ + "%*u " /* (12) majflt %lu */ + "%*u " /* (13) cmajflt %lu */ + "%*u " /* (14) utime %lu */ + "%*u " /* (15) stime %lu */ + "%*d " /* (16) cutime %ld */ + "%*d " /* (17) cstime %ld */ + "%*d " /* (18) priority %ld */ + "%*d " /* (19) nice %ld */ + "%*d " /* (20) num_threads %ld */ + "%*d " /* (21) itrealvalue %ld */ + "%" PRIu64, /* (22) starttime %llu */ + &starttime); + if (ret != 1) { + fclose(f); + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + fclose(f); + + errno = 0; + return starttime; +} + +static uint64_t get_reaper_start_time_in_sec(pid_t pid) +{ + uint64_t clockticks; + int64_t ticks_per_sec; + + clockticks = get_reaper_start_time(pid); + if (clockticks == 0 && errno == EINVAL) { + lxcfs_debug("failed to retrieve start time of pid %d\n", pid); + return 0; + } + + ticks_per_sec = sysconf(_SC_CLK_TCK); + if (ticks_per_sec < 0 && errno == EINVAL) { + lxcfs_debug( + "%s\n", + "failed to determine number of clock ticks in a second"); + return 0; + } + + return (clockticks /= ticks_per_sec); +} + +static uint64_t get_reaper_age(pid_t pid) +{ + uint64_t procstart, uptime, procage; + + /* We need to substract the time the process has started since system + * boot minus the time when the system has started to get the actual + * reaper age. + */ + procstart = get_reaper_start_time_in_sec(pid); + procage = procstart; + if (procstart > 0) { + int ret; + struct timespec spec; + + ret = clock_gettime(CLOCK_BOOTTIME, &spec); + if (ret < 0) + return 0; + /* We could make this more precise here by using the tv_nsec + * field in the timespec struct and convert it to milliseconds + * and then create a double for the seconds and milliseconds but + * that seems more work than it is worth. + */ + uptime = spec.tv_sec; + procage = uptime - procstart; + } + + return procage; +} + +/* + * Returns 0 on success. + * It is the caller's responsibility to free `return_usage`, unless this + * function returns an error. + */ +static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage) +{ + int cpucount = get_nprocs(); + struct cpuacct_usage *cpu_usage; + int rv = 0, i, j, ret, read_pos = 0, read_cnt; + int cg_cpu; + uint64_t cg_user, cg_system; + int64_t ticks_per_sec; + char *usage_str = NULL; + + ticks_per_sec = sysconf(_SC_CLK_TCK); + + if (ticks_per_sec < 0 && errno == EINVAL) { + lxcfs_debug( + "%s\n", + "read_cpuacct_usage_all failed to determine number of clock ticks " + "in a second"); + return -1; + } + + cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount); + if (!cpu_usage) + return -ENOMEM; + + if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) { + rv = -1; + goto err; + } + + if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) { + lxcfs_error("read_cpuacct_usage_all reading first line from " + "%s/cpuacct.usage_all failed.\n", cg); + rv = -1; + goto err; + } + + read_pos += read_cnt; + + for (i = 0, j = 0; i < cpucount; i++) { + ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user, + &cg_system, &read_cnt); + + if (ret == EOF) + break; + + if (ret != 3) { + lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all " + "failed.\n", cg); + rv = -1; + goto err; + } + + read_pos += read_cnt; + + if (!cpu_in_cpuset(i, cpuset)) + continue; + + /* Convert the time from nanoseconds to USER_HZ */ + cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec; + cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec; + j++; + } + + rv = 0; + *return_usage = cpu_usage; + +err: + if (usage_str) + free(usage_str); + + if (rv != 0) { + free(cpu_usage); + *return_usage = NULL; + } + + return rv; +} + +#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2) static int proc_stat_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { @@ -3474,15 +3898,15 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; int curcpu = -1; /* cpu numbering starts at 0 */ - unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0; + unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0; unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0, - irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0; -#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE + irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0; char cpuall[CPUALL_MAX_SIZE]; /* reserve for cpu all */ char *cache = d->buf + CPUALL_MAX_SIZE; size_t cache_size = d->buflen - CPUALL_MAX_SIZE; FILE *f = NULL; + struct cpuacct_usage *cg_cpu_usage = NULL; if (offset){ if (offset > d->size) @@ -3507,6 +3931,16 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, if (!cpuset) goto err; + /* + * Read cpuacct.usage_all for all CPUs. + * If the cpuacct cgroup is present, it is used to calculate the container's + * CPU usage. If not, values from the host's /proc/stat are used. + */ + if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) { + lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, " + "falling back to the host's /proc/stat"); + } + f = fopen("/proc/stat", "r"); if (!f) goto err; @@ -3522,6 +3956,8 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, int cpu; char cpu_char[10]; /* That's a lot of cores */ char *c; + uint64_t all_used, cg_used, new_idle; + int ret; if (strlen(line) == 0) continue; @@ -3550,48 +3986,112 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, continue; curcpu ++; - c = strchr(line, ' '); - if (!c) - continue; - l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c); - if (l < 0) { - perror("Error writing to cache"); - rv = 0; - goto err; + ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + &user, + &nice, + &system, + &idle, + &iowait, + &irq, + &softirq, + &steal, + &guest, + &guest_nice); + + if (ret != 10 || !cg_cpu_usage) { + c = strchr(line, ' '); + if (!c) + continue; + l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c); + if (l < 0) { + perror("Error writing to cache"); + rv = 0; + goto err; - } - if (l >= cache_size) { - lxcfs_error("%s\n", "Internal error: truncated write to cache."); - rv = 0; - goto err; + } + if (l >= cache_size) { + lxcfs_error("%s\n", "Internal error: truncated write to cache."); + rv = 0; + goto err; + } + + cache += l; + cache_size -= l; + total_len += l; + + if (ret != 10) + continue; } - cache += l; - cache_size -= l; - total_len += l; + if (cg_cpu_usage) { + all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice; + cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system; - if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq, - &softirq, &steal, &guest) != 9) - continue; - user_sum += user; - nice_sum += nice; - system_sum += system; - idle_sum += idle; - iowait_sum += iowait; - irq_sum += irq; - softirq_sum += softirq; - steal_sum += steal; - guest_sum += guest; + if (all_used >= cg_used) { + new_idle = idle + (all_used - cg_used); + + } else { + lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, " + "%lu in cpuacct.usage_all; unable to determine idle time\n", + curcpu, cg, all_used, cg_used); + new_idle = idle; + } + + l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n", + curcpu, cg_cpu_usage[curcpu].user, cg_cpu_usage[curcpu].system, + new_idle); + + if (l < 0) { + perror("Error writing to cache"); + rv = 0; + goto err; + + } + if (l >= cache_size) { + lxcfs_error("%s\n", "Internal error: truncated write to cache."); + rv = 0; + goto err; + } + + cache += l; + cache_size -= l; + total_len += l; + + user_sum += cg_cpu_usage[curcpu].user; + system_sum += cg_cpu_usage[curcpu].system; + idle_sum += new_idle; + + } else { + user_sum += user; + nice_sum += nice; + system_sum += system; + idle_sum += idle; + iowait_sum += iowait; + irq_sum += irq; + softirq_sum += softirq; + steal_sum += steal; + guest_sum += guest; + guest_nice_sum += guest_nice; + } } cache = d->buf; - int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum); - if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){ + int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + user_sum, + nice_sum, + system_sum, + idle_sum, + iowait_sum, + irq_sum, + softirq_sum, + steal_sum, + guest_sum, + guest_nice_sum); + if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) { memcpy(cache, cpuall, cpuall_len); cache += cpuall_len; - } else{ + } else { /* shouldn't happen */ lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len); cpuall_len = 0; @@ -3601,7 +4101,8 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, total_len += cpuall_len; d->cached = 1; d->size = total_len; - if (total_len > size ) total_len = size; + if (total_len > size) + total_len = size; memcpy(buf, d->buf, total_len); rv = total_len; @@ -3609,33 +4110,21 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, err: if (f) fclose(f); + if (cg_cpu_usage) + free(cg_cpu_usage); free(line); free(cpuset); free(cg); return rv; } -static long int getreaperage(pid_t pid) -{ - char fnam[100]; - struct stat sb; - int ret; - pid_t qpid; - - qpid = lookup_initpid_in_store(pid); - if (qpid <= 0) - return 0; - - ret = snprintf(fnam, 100, "/proc/%d", qpid); - if (ret < 0 || ret >= 100) - return 0; - - if (lstat(fnam, &sb) < 0) - return 0; - - return time(NULL) - sb.st_ctime; -} - +/* This function retrieves the busy time of a group of tasks by looking at + * cpuacct.usage. Unfortunately, this only makes sense when the container has + * been given it's own cpuacct cgroup. If not, this function will take the busy + * time of all other taks that do not actually belong to the container into + * account as well. If someone has a clever solution for this please send a + * patch! + */ static unsigned long get_reaper_busy(pid_t task) { pid_t initpid = lookup_initpid_in_store(task); @@ -3681,33 +4170,37 @@ static int proc_uptime_read(char *buf, size_t size, off_t offset, { struct fuse_context *fc = fuse_get_context(); struct file_info *d = (struct file_info *)fi->fh; - long int reaperage = getreaperage(fc->pid); - unsigned long int busytime = get_reaper_busy(fc->pid), idletime; + unsigned long int busytime = get_reaper_busy(fc->pid); char *cache = d->buf; ssize_t total_len = 0; + uint64_t idletime, reaperage; #if RELOADTEST iwashere(); #endif if (offset){ - if (offset > d->size) - return -EINVAL; if (!d->cached) return 0; + if (offset > d->size) + return -EINVAL; int left = d->size - offset; total_len = left > size ? size: left; memcpy(buf, cache + offset, total_len); return total_len; } - idletime = reaperage - busytime; - if (idletime > reaperage) - idletime = reaperage; + reaperage = get_reaper_age(fc->pid); + /* To understand why this is done, please read the comment to the + * get_reaper_busy() function. + */ + idletime = reaperage; + if (reaperage >= busytime) + idletime = reaperage - busytime; - total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime); - if (total_len < 0){ - perror("Error writing to cache"); + total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime); + if (total_len < 0 || total_len >= d->buflen){ + lxcfs_error("%s\n", "failed to write to cache"); return 0; } @@ -3859,8 +4352,7 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, struct fuse_context *fc = fuse_get_context(); struct file_info *d = (struct file_info *)fi->fh; char *cg = NULL; - char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL, - *memswlimit_default_str = NULL, *memswusage_default_str = NULL; + char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL; unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0; ssize_t total_len = 0, rv = 0; ssize_t l = 0; @@ -3885,32 +4377,19 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, return read_file("/proc/swaps", buf, size, d); prune_init_slice(cg); - if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str)) - goto err; + memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str)) goto err; - memlimit = strtoul(memlimit_str, NULL, 10); memusage = strtoul(memusage_str, NULL, 10); if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) && cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) { - /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */ - if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str)) - goto err; - if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str)) - goto err; - - memswlimit = strtoul(memswlimit_str, NULL, 10); + memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes"); memswusage = strtoul(memswusage_str, NULL, 10); - if (!strcmp(memswlimit_str, memswlimit_default_str)) - memswlimit = 0; - if (!strcmp(memswusage_str, memswusage_default_str)) - memswusage = 0; - swap_total = (memswlimit - memlimit) / 1024; swap_free = (memswusage - memusage) / 1024; } @@ -3964,10 +4443,398 @@ err: free(memlimit_str); free(memusage_str); free(memswusage_str); - free(memswusage_default_str); - free(memswlimit_default_str); return rv; } +/* + * Find the process pid from cgroup path. + * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid. + * @pid_buf : put pid to pid_buf. + * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ... + * @depth : the depth of cgroup in container. + * @sum : return the number of pid. + * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu + */ +static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd) +{ + DIR *dir; + int fd; + struct dirent *file; + FILE *f = NULL; + size_t linelen = 0; + char *line = NULL; + int pd; + char *path_dir, *path; + char **pid; + + /* path = dpath + "/cgroup.procs" + /0 */ + do { + path = malloc(strlen(dpath) + 20); + } while (!path); + + strcpy(path, dpath); + fd = openat(cfd, path, O_RDONLY); + if (fd < 0) + goto out; + + dir = fdopendir(fd); + if (dir == NULL) { + close(fd); + goto out; + } + + while (((file = readdir(dir)) != NULL) && depth > 0) { + if (strncmp(file->d_name, ".", 1) == 0) + continue; + if (strncmp(file->d_name, "..", 1) == 0) + continue; + if (file->d_type == DT_DIR) { + /* path + '/' + d_name +/0 */ + do { + path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name)); + } while (!path_dir); + strcpy(path_dir, path); + strcat(path_dir, "/"); + strcat(path_dir, file->d_name); + pd = depth - 1; + sum = calc_pid(pid_buf, path_dir, pd, sum, cfd); + free(path_dir); + } + } + closedir(dir); + + strcat(path, "/cgroup.procs"); + fd = openat(cfd, path, O_RDONLY); + if (fd < 0) + goto out; + + f = fdopen(fd, "r"); + if (!f) { + close(fd); + goto out; + } + + while (getline(&line, &linelen, f) != -1) { + do { + pid = realloc(*pid_buf, sizeof(char *) * (sum + 1)); + } while (!pid); + *pid_buf = pid; + do { + *(*pid_buf + sum) = malloc(strlen(line) + 1); + } while (*(*pid_buf + sum) == NULL); + strcpy(*(*pid_buf + sum), line); + sum++; + } + fclose(f); +out: + free(path); + return sum; +} +/* + * calc_load calculates the load according to the following formula: + * load1 = load0 * exp + active * (1 - exp) + * + * @load1: the new loadavg. + * @load0: the former loadavg. + * @active: the total number of running pid at this moment. + * @exp: the fixed-point defined in the beginning. + */ +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + unsigned long newload; + + active = active > 0 ? active * FIXED_1 : 0; + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1 - 1; + + return newload / FIXED_1; +} + +/* + * Return 0 means that container p->cg is closed. + * Return -1 means that error occurred in refresh. + * Positive num equals the total number of pid. + */ +static int refresh_load(struct load_node *p, char *path) +{ + FILE *f = NULL; + char **idbuf; + char proc_path[256]; + int i, ret, run_pid = 0, total_pid = 0, last_pid = 0; + char *line = NULL; + size_t linelen = 0; + int sum, length; + DIR *dp; + struct dirent *file; + + do { + idbuf = malloc(sizeof(char *)); + } while (!idbuf); + sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd); + /* normal exit */ + if (sum == 0) + goto out; + + for (i = 0; i < sum; i++) { + /*clean up '\n' */ + length = strlen(idbuf[i])-1; + idbuf[i][length] = '\0'; + ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]); + if (ret < 0 || ret > 255) { + lxcfs_error("%s\n", "snprintf() failed in refresh_load."); + i = sum; + sum = -1; + goto err_out; + } + + dp = opendir(proc_path); + if (!dp) { + lxcfs_error("%s\n", "Open proc_path failed in refresh_load."); + continue; + } + while ((file = readdir(dp)) != NULL) { + if (strncmp(file->d_name, ".", 1) == 0) + continue; + if (strncmp(file->d_name, "..", 1) == 0) + continue; + total_pid++; + /* We make the biggest pid become last_pid.*/ + ret = atof(file->d_name); + last_pid = (ret > last_pid) ? ret : last_pid; + + ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name); + if (ret < 0 || ret > 255) { + lxcfs_error("%s\n", "snprintf() failed in refresh_load."); + i = sum; + sum = -1; + closedir(dp); + goto err_out; + } + f = fopen(proc_path, "r"); + if (f != NULL) { + while (getline(&line, &linelen, f) != -1) { + /* Find State */ + if ((line[0] == 'S') && (line[1] == 't')) + break; + } + if ((line[7] == 'R') || (line[7] == 'D')) + run_pid++; + fclose(f); + } + } + closedir(dp); + } + /*Calculate the loadavg.*/ + p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid); + p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid); + p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid); + p->run_pid = run_pid; + p->total_pid = total_pid; + p->last_pid = last_pid; + + free(line); +err_out: + for (; i > 0; i--) + free(idbuf[i-1]); +out: + free(idbuf); + return sum; +} +/* + * Traverse the hash table and update it. + */ +void *load_begin(void *arg) +{ + + char *path = NULL; + int i, sum, length, ret; + struct load_node *f; + int first_node; + clock_t time1, time2; + + while (1) { + if (loadavg_stop == 1) + return NULL; + + time1 = clock(); + for (i = 0; i < LOAD_SIZE; i++) { + pthread_mutex_lock(&load_hash[i].lock); + if (load_hash[i].next == NULL) { + pthread_mutex_unlock(&load_hash[i].lock); + continue; + } + f = load_hash[i].next; + first_node = 1; + while (f) { + length = strlen(f->cg) + 2; + do { + /* strlen(f->cg) + '.' or '' + \0 */ + path = malloc(length); + } while (!path); + + ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg); + if (ret < 0 || ret > length - 1) { + /* snprintf failed, ignore the node.*/ + lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg); + goto out; + } + sum = refresh_load(f, path); + if (sum == 0) { + f = del_node(f, i); + } else { +out: f = f->next; + } + free(path); + /* load_hash[i].lock locks only on the first node.*/ + if (first_node == 1) { + first_node = 0; + pthread_mutex_unlock(&load_hash[i].lock); + } + } + } + + if (loadavg_stop == 1) + return NULL; + + time2 = clock(); + usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC)); + } +} + +static int proc_loadavg_read(char *buf, size_t size, off_t offset, + struct fuse_file_info *fi) +{ + struct fuse_context *fc = fuse_get_context(); + struct file_info *d = (struct file_info *)fi->fh; + pid_t initpid; + char *cg; + size_t total_len = 0; + char *cache = d->buf; + struct load_node *n; + int hash; + int cfd, rv = 0; + unsigned long a, b, c; + + if (offset) { + if (offset > d->size) + return -EINVAL; + if (!d->cached) + return 0; + int left = d->size - offset; + total_len = left > size ? size : left; + memcpy(buf, cache + offset, total_len); + return total_len; + } + if (!loadavg) + return read_file("/proc/loadavg", buf, size, d); + + initpid = lookup_initpid_in_store(fc->pid); + if (initpid <= 0) + initpid = fc->pid; + cg = get_pid_cgroup(initpid, "cpu"); + if (!cg) + return read_file("/proc/loadavg", buf, size, d); + + prune_init_slice(cg); + hash = calc_hash(cg); + n = locate_node(cg, hash); + + /* First time */ + if (n == NULL) { + if (!find_mounted_controller("cpu", &cfd)) { + /* + * In locate_node() above, pthread_rwlock_unlock() isn't used + * because delete is not allowed before read has ended. + */ + pthread_rwlock_unlock(&load_hash[hash].rdlock); + rv = 0; + goto err; + } + do { + n = malloc(sizeof(struct load_node)); + } while (!n); + + do { + n->cg = malloc(strlen(cg)+1); + } while (!n->cg); + strcpy(n->cg, cg); + n->avenrun[0] = 0; + n->avenrun[1] = 0; + n->avenrun[2] = 0; + n->run_pid = 0; + n->total_pid = 1; + n->last_pid = initpid; + n->cfd = cfd; + insert_node(&n, hash); + } + a = n->avenrun[0] + (FIXED_1/200); + b = n->avenrun[1] + (FIXED_1/200); + c = n->avenrun[2] + (FIXED_1/200); + total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + n->run_pid, n->total_pid, n->last_pid); + pthread_rwlock_unlock(&load_hash[hash].rdlock); + if (total_len < 0 || total_len >= d->buflen) { + lxcfs_error("%s\n", "Failed to write to cache"); + rv = 0; + goto err; + } + d->size = (int)total_len; + d->cached = 1; + + if (total_len > size) + total_len = size; + memcpy(buf, d->buf, total_len); + rv = total_len; + +err: + free(cg); + return rv; +} +/* Return a positive number on success, return 0 on failure.*/ +pthread_t load_daemon(int load_use) +{ + int ret; + pthread_t pid; + + ret = init_load(); + if (ret == -1) { + lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!"); + return 0; + } + ret = pthread_create(&pid, NULL, load_begin, NULL); + if (ret != 0) { + lxcfs_error("%s\n", "Create pthread fails in load_daemon!"); + load_free(); + return 0; + } + /* use loadavg, here loadavg = 1*/ + loadavg = load_use; + return pid; +} + +/* Returns 0 on success. */ +int stop_load_daemon(pthread_t pid) +{ + int s; + + /* Signal the thread to gracefully stop */ + loadavg_stop = 1; + + s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */ + if (s != 0) { + lxcfs_error("%s\n", "stop_load_daemon error: failed to join"); + return -1; + } + + load_free(); + loadavg_stop = 0; + + return 0; +} static off_t get_procfile_size(const char *which) { @@ -4005,7 +4872,8 @@ int proc_getattr(const char *path, struct stat *sb) strcmp(path, "/proc/uptime") == 0 || strcmp(path, "/proc/stat") == 0 || strcmp(path, "/proc/diskstats") == 0 || - strcmp(path, "/proc/swaps") == 0) { + strcmp(path, "/proc/swaps") == 0 || + strcmp(path, "/proc/loadavg") == 0) { sb->st_size = 0; sb->st_mode = S_IFREG | 00444; sb->st_nlink = 1; @@ -4025,7 +4893,8 @@ int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offs filler(buf, "stat", NULL, 0) != 0 || filler(buf, "uptime", NULL, 0) != 0 || filler(buf, "diskstats", NULL, 0) != 0 || - filler(buf, "swaps", NULL, 0) != 0) + filler(buf, "swaps", NULL, 0) != 0 || + filler(buf, "loadavg", NULL, 0) != 0) return -EINVAL; return 0; } @@ -4047,6 +4916,8 @@ int proc_open(const char *path, struct fuse_file_info *fi) type = LXC_TYPE_PROC_DISKSTATS; else if (strcmp(path, "/proc/swaps") == 0) type = LXC_TYPE_PROC_SWAPS; + else if (strcmp(path, "/proc/loadavg") == 0) + type = LXC_TYPE_PROC_LOADAVG; if (type == -1) return -ENOENT; @@ -4104,6 +4975,8 @@ int proc_read(const char *path, char *buf, size_t size, off_t offset, return proc_diskstats_read(buf, size, offset, fi); case LXC_TYPE_PROC_SWAPS: return proc_swaps_read(buf, size, offset, fi); + case LXC_TYPE_PROC_LOADAVG: + return proc_loadavg_read(buf, size, offset, fi); default: return -EINVAL; } @@ -4338,6 +5211,19 @@ static bool permute_root(void) return true; } +static int preserve_mnt_ns(int pid) +{ + int ret; + size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt"); + char path[len]; + + ret = snprintf(path, len, "/proc/%d/ns/mnt", pid); + if (ret < 0 || (size_t)ret >= len) + return -1; + + return open(path, O_RDONLY | O_CLOEXEC); +} + static bool cgfs_prepare_mounts(void) { if (!mkdir_p(BASEDIR, 0700)) { @@ -4355,6 +5241,12 @@ static bool cgfs_prepare_mounts(void) return false; } + cgroup_mount_ns_fd = preserve_mnt_ns(getpid()); + if (cgroup_mount_ns_fd < 0) { + lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); + return false; + } + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); return false; @@ -4376,11 +5268,13 @@ static bool cgfs_mount_hierarchies(void) for (i = 0; i < num_hierarchies; i++) { char *controller = hierarchies[i]; + clen = strlen(controller); len = strlen(BASEDIR) + clen + 2; target = malloc(len); if (!target) return false; + ret = snprintf(target, len, "%s/%s", BASEDIR, controller); if (ret < 0 || ret >= len) { free(target); @@ -4390,8 +5284,12 @@ static bool cgfs_mount_hierarchies(void) free(target); return false; } - if (mount(controller, target, "cgroup", 0, controller) < 0) { - lxcfs_error("Failed mounting cgroup %s\n", controller); + if (!strcmp(controller, "unified")) + ret = mount("none", target, "cgroup2", 0, NULL); + else + ret = mount(controller, target, "cgroup", 0, controller); + if (ret < 0) { + lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno)); free(target); return false; } @@ -4422,19 +5320,6 @@ static bool cgfs_setup_controllers(void) return true; } -static int preserve_ns(int pid) -{ - int ret; - size_t len = 5 /* /proc */ + 21 /* /int_as_str */ + 7 /* /ns/mnt */ + 1 /* \0 */; - char path[len]; - - ret = snprintf(path, len, "/proc/%d/ns/mnt", pid); - if (ret < 0 || (size_t)ret >= len) - return -1; - - return open(path, O_RDONLY | O_CLOEXEC); -} - static void __attribute__((constructor)) collect_and_mount_subsystems(void) { FILE *f; @@ -4442,6 +5327,7 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void) char cwd[MAXPATHLEN]; size_t len = 0; int i, init_ns = -1; + bool found_unified = false; if ((f = fopen("/proc/self/cgroup", "r")) == NULL) { lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno)); @@ -4449,11 +5335,12 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void) } while (getline(&line, &len, f) != -1) { - char *p, *p2; + char *idx, *p, *p2; p = strchr(line, ':'); if (!p) goto out; + idx = line; *(p++) = '\0'; p2 = strrchr(p, ':'); @@ -4466,21 +5353,23 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void) * because it parses out the empty string "" and later on passes * it to mount(). Let's skip such entries. */ - if (!strcmp(p, "")) - continue; + if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) { + found_unified = true; + p = "unified"; + } if (!store_hierarchy(line, p)) goto out; } /* Preserve initial namespace. */ - init_ns = preserve_ns(getpid()); + init_ns = preserve_mnt_ns(getpid()); if (init_ns < 0) { lxcfs_error("%s\n", "Failed to preserve initial mount namespace."); goto out; } - fd_hierarchies = malloc(sizeof(int *) * num_hierarchies); + fd_hierarchies = malloc(sizeof(int) * num_hierarchies); if (!fd_hierarchies) { lxcfs_error("%s\n", strerror(errno)); goto out; @@ -4531,4 +5420,7 @@ static void __attribute__((destructor)) free_subsystems(void) } free(hierarchies); free(fd_hierarchies); + + if (cgroup_mount_ns_fd >= 0) + close(cgroup_mount_ns_fd); }