X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=bindings.c;h=e838441c01213255c479057c1a5b2c7f1b2ac829;hb=895f28e5f726fbb8ad2078cf55db5b82128f9b94;hp=f3cd85dab317967f3f6d14a30ea106c808ce13c1;hpb=99142521d202c1b626dd308e25c42eb95f613c21;p=mirror_lxcfs.git diff --git a/bindings.c b/bindings.c index f3cd85d..e838441 100644 --- a/bindings.c +++ b/bindings.c @@ -8,20 +8,24 @@ #define FUSE_USE_VERSION 26 +#define __STDC_FORMAT_MACROS #include #include #include #include +#include #include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -29,10 +33,15 @@ #include #include #include +#include +#include #include "bindings.h" #include "config.h" // for VERSION +/* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */ +#define LXCFS_NUMSTRLEN64 21 + /* Define pivot_root() if missing from the C library */ #ifndef HAVE_PIVOT_ROOT static int pivot_root(const char * new_root, const char * put_old) @@ -48,16 +57,6 @@ return -1; extern int pivot_root(const char * new_root, const char * put_old); #endif -#ifdef DEBUG -#define lxcfs_debug(format, ...) \ - do { \ - fprintf(stderr, "%s: %d: %s: " format, __FILE__, __LINE__, \ - __func__, __VA_ARGS__); \ - } while (false) -#else -#define lxcfs_debug(format, ...) -#endif /* DEBUG */ - enum { LXC_TYPE_CGDIR, LXC_TYPE_CGFILE, @@ -67,6 +66,7 @@ enum { LXC_TYPE_PROC_STAT, LXC_TYPE_PROC_DISKSTATS, LXC_TYPE_PROC_SWAPS, + LXC_TYPE_PROC_LOADAVG, }; struct file_info { @@ -80,8 +80,323 @@ struct file_info { int cached; }; -/* reserve buffer size, for cpuall in /proc/stat */ -#define BUF_RESERVE_SIZE 256 +struct cpuacct_usage { + uint64_t user; + uint64_t system; + uint64_t idle; +}; + +/* The function of hash table.*/ +#define LOAD_SIZE 100 /*the size of hash_table */ +#define FLUSH_TIME 5 /*the flush rate */ +#define DEPTH_DIR 3 /*the depth of per cgroup */ +/* The function of calculate loadavg .*/ +#define FSHIFT 11 /* nr of bits of precision */ +#define FIXED_1 (1<> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) +/* + * This parameter is used for proc_loadavg_read(). + * 1 means use loadavg, 0 means not use. + */ +static int loadavg = 0; +static volatile sig_atomic_t loadavg_stop = 0; +static int calc_hash(const char *name) +{ + unsigned int hash = 0; + unsigned int x = 0; + /* ELFHash algorithm. */ + while (*name) { + hash = (hash << 4) + *name++; + x = hash & 0xf0000000; + if (x != 0) + hash ^= (x >> 24); + hash &= ~x; + } + return (hash & 0x7fffffff); +} + +struct load_node { + char *cg; /*cg */ + unsigned long avenrun[3]; /* Load averages */ + unsigned int run_pid; + unsigned int total_pid; + unsigned int last_pid; + int cfd; /* The file descriptor of the mounted cgroup */ + struct load_node *next; + struct load_node **pre; +}; + +struct load_head { + /* + * The lock is about insert load_node and refresh load_node.To the first + * load_node of each hash bucket, insert and refresh in this hash bucket is + * mutually exclusive. + */ + pthread_mutex_t lock; + /* + * The rdlock is about read loadavg and delete load_node.To each hash + * bucket, read and delete is mutually exclusive. But at the same time, we + * allow paratactic read operation. This rdlock is at list level. + */ + pthread_rwlock_t rdlock; + /* + * The rilock is about read loadavg and insert load_node.To the first + * load_node of each hash bucket, read and insert is mutually exclusive. + * But at the same time, we allow paratactic read operation. + */ + pthread_rwlock_t rilock; + struct load_node *next; +}; + +static struct load_head load_hash[LOAD_SIZE]; /* hash table */ +/* + * init_load initialize the hash table. + * Return 0 on success, return -1 on failure. + */ +static int init_load(void) +{ + int i; + int ret; + + for (i = 0; i < LOAD_SIZE; i++) { + load_hash[i].next = NULL; + ret = pthread_mutex_init(&load_hash[i].lock, NULL); + if (ret != 0) { + lxcfs_error("%s\n", "Failed to initialize lock"); + goto out3; + } + ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL); + if (ret != 0) { + lxcfs_error("%s\n", "Failed to initialize rdlock"); + goto out2; + } + ret = pthread_rwlock_init(&load_hash[i].rilock, NULL); + if (ret != 0) { + lxcfs_error("%s\n", "Failed to initialize rilock"); + goto out1; + } + } + return 0; +out1: + pthread_rwlock_destroy(&load_hash[i].rdlock); +out2: + pthread_mutex_destroy(&load_hash[i].lock); +out3: + while (i > 0) { + i--; + pthread_mutex_destroy(&load_hash[i].lock); + pthread_rwlock_destroy(&load_hash[i].rdlock); + pthread_rwlock_destroy(&load_hash[i].rilock); + } + return -1; +} + +static void insert_node(struct load_node **n, int locate) +{ + struct load_node *f; + + pthread_mutex_lock(&load_hash[locate].lock); + pthread_rwlock_wrlock(&load_hash[locate].rilock); + f = load_hash[locate].next; + load_hash[locate].next = *n; + + (*n)->pre = &(load_hash[locate].next); + if (f) + f->pre = &((*n)->next); + (*n)->next = f; + pthread_mutex_unlock(&load_hash[locate].lock); + pthread_rwlock_unlock(&load_hash[locate].rilock); +} +/* + * locate_node() finds special node. Not return NULL means success. + * It should be noted that rdlock isn't unlocked at the end of code + * because this function is used to read special node. Delete is not + * allowed before read has ended. + * unlock rdlock only in proc_loadavg_read(). + */ +static struct load_node *locate_node(char *cg, int locate) +{ + struct load_node *f = NULL; + int i = 0; + + pthread_rwlock_rdlock(&load_hash[locate].rilock); + pthread_rwlock_rdlock(&load_hash[locate].rdlock); + if (load_hash[locate].next == NULL) { + pthread_rwlock_unlock(&load_hash[locate].rilock); + return f; + } + f = load_hash[locate].next; + pthread_rwlock_unlock(&load_hash[locate].rilock); + while (f && ((i = strcmp(f->cg, cg)) != 0)) + f = f->next; + return f; +} +/* Delete the load_node n and return the next node of it. */ +static struct load_node *del_node(struct load_node *n, int locate) +{ + struct load_node *g; + + pthread_rwlock_wrlock(&load_hash[locate].rdlock); + if (n->next == NULL) { + *(n->pre) = NULL; + } else { + *(n->pre) = n->next; + n->next->pre = n->pre; + } + g = n->next; + free(n->cg); + free(n); + pthread_rwlock_unlock(&load_hash[locate].rdlock); + return g; +} + +static void load_free(void) +{ + int i; + struct load_node *f, *p; + + for (i = 0; i < LOAD_SIZE; i++) { + pthread_mutex_lock(&load_hash[i].lock); + pthread_rwlock_wrlock(&load_hash[i].rilock); + pthread_rwlock_wrlock(&load_hash[i].rdlock); + if (load_hash[i].next == NULL) { + pthread_mutex_unlock(&load_hash[i].lock); + pthread_mutex_destroy(&load_hash[i].lock); + pthread_rwlock_unlock(&load_hash[i].rilock); + pthread_rwlock_destroy(&load_hash[i].rilock); + pthread_rwlock_unlock(&load_hash[i].rdlock); + pthread_rwlock_destroy(&load_hash[i].rdlock); + continue; + } + for (f = load_hash[i].next; f; ) { + free(f->cg); + p = f->next; + free(f); + f = p; + } + pthread_mutex_unlock(&load_hash[i].lock); + pthread_mutex_destroy(&load_hash[i].lock); + pthread_rwlock_unlock(&load_hash[i].rilock); + pthread_rwlock_destroy(&load_hash[i].rilock); + pthread_rwlock_unlock(&load_hash[i].rdlock); + pthread_rwlock_destroy(&load_hash[i].rdlock); + } +} + +/* Data for CPU view */ +struct cg_proc_stat { + char *cg; + struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat + struct cpuacct_usage *view; // Usage stats reported to the container + int cpu_count; + pthread_mutex_t lock; // For node manipulation + struct cg_proc_stat *next; +}; + +struct cg_proc_stat_head { + struct cg_proc_stat *next; + time_t lastcheck; + + /* + * For access to the list. Reading can be parallel, pruning is exclusive. + */ + pthread_rwlock_t lock; +}; + +#define CPUVIEW_HASH_SIZE 100 +static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE]; + +static bool cpuview_init_head(struct cg_proc_stat_head **head) +{ + *head = malloc(sizeof(struct cg_proc_stat_head)); + if (!(*head)) { + lxcfs_error("%s\n", strerror(errno)); + return false; + } + + (*head)->lastcheck = time(NULL); + (*head)->next = NULL; + + if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) { + lxcfs_error("%s\n", "Failed to initialize list lock"); + free(*head); + return false; + } + + return true; +} + +static bool init_cpuview() +{ + int i; + + for (i = 0; i < CPUVIEW_HASH_SIZE; i++) + proc_stat_history[i] = NULL; + + for (i = 0; i < CPUVIEW_HASH_SIZE; i++) { + if (!cpuview_init_head(&proc_stat_history[i])) + goto err; + } + + return true; + +err: + for (i = 0; i < CPUVIEW_HASH_SIZE; i++) { + if (proc_stat_history[i]) { + free(proc_stat_history[i]); + proc_stat_history[i] = NULL; + } + } + + return false; +} + +static void free_proc_stat_node(struct cg_proc_stat *node) +{ + pthread_mutex_destroy(&node->lock); + free(node->cg); + free(node->usage); + free(node->view); + free(node); +} + +static void cpuview_free_head(struct cg_proc_stat_head *head) +{ + struct cg_proc_stat *node, *tmp; + + if (head->next) { + node = head->next; + + for (;;) { + tmp = node; + node = node->next; + free_proc_stat_node(tmp); + + if (!node) + break; + } + } + + pthread_rwlock_destroy(&head->lock); + free(head); +} + +static void free_cpuview() +{ + int i; + + for (i = 0; i < CPUVIEW_HASH_SIZE; i++) { + if (proc_stat_history[i]) + cpuview_free_head(proc_stat_history[i]); + } +} + +/* Reserve buffer size to account for file size changes. */ +#define BUF_RESERVE_SIZE 512 /* * A table caching which pid is init for a pid namespace. @@ -116,7 +431,7 @@ static void lock_mutex(pthread_mutex_t *l) int ret; if ((ret = pthread_mutex_lock(l)) != 0) { - fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", ret, strerror(ret)); + lxcfs_error("returned:%d %s\n", ret, strerror(ret)); exit(1); } } @@ -140,13 +455,14 @@ static char **hierarchies; * another namespace using the *at() family of functions * {openat(), fchownat(), ...}. */ static int *fd_hierarchies; +static int cgroup_mount_ns_fd = -1; static void unlock_mutex(pthread_mutex_t *l) { int ret; if ((ret = pthread_mutex_unlock(l)) != 0) { - fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", ret, strerror(ret)); + lxcfs_error("returned:%d %s\n", ret, strerror(ret)); exit(1); } } @@ -378,19 +694,24 @@ static bool write_string(const char *fnam, const char *string, int fd) FILE *f; size_t len, ret; - if (!(f = fdopen(fd, "w"))) + f = fdopen(fd, "w"); + if (!f) return false; + len = strlen(string); ret = fwrite(string, 1, len, f); if (ret != len) { - fprintf(stderr, "Error writing to file: %s\n", strerror(errno)); + lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n", + strerror(errno), string, fnam); fclose(f); return false; } + if (fclose(f) < 0) { - fprintf(stderr, "Error writing to file: %s\n", strerror(errno)); + lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam); return false; } + return true; } @@ -408,7 +729,7 @@ static bool store_hierarchy(char *stridx, char *h) n *= ALLOC_NUM; char **tmp = realloc(hierarchies, n * sizeof(char *)); if (!tmp) { - fprintf(stderr, "Out of memory\n"); + lxcfs_error("%s\n", strerror(errno)); exit(1); } hierarchies = tmp; @@ -422,10 +743,12 @@ static void print_subsystems(void) { int i; + fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd); fprintf(stderr, "hierarchies:\n"); for (i = 0; i < num_hierarchies; i++) { if (hierarchies[i]) - fprintf(stderr, " %d: %s\n", i, hierarchies[i]); + fprintf(stderr, " %2d: fd: %3d: %s\n", i, + fd_hierarchies[i], hierarchies[i]); } } @@ -512,7 +835,7 @@ static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, in len = strlen(dirname); if (len >= MAXPATHLEN) { - fprintf(stderr, "chown_all_cgroup_files: pathname too long: %s\n", dirname); + lxcfs_error("Pathname too long: %s\n", dirname); return; } @@ -522,7 +845,7 @@ static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, in d = fdopendir(fd1); if (!d) { - fprintf(stderr, "chown_all_cgroup_files: failed to open %s\n", dirname); + lxcfs_error("Failed to open %s\n", dirname); return; } @@ -531,11 +854,11 @@ static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, in continue; ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); if (ret < 0 || ret >= MAXPATHLEN) { - fprintf(stderr, "chown_all_cgroup_files: pathname too long under %s\n", dirname); + lxcfs_error("Pathname too long under %s\n", dirname); continue; } if (fchownat(fd, path, uid, gid, 0) < 0) - fprintf(stderr, "Failed to chown file %s to %u:%u", path, uid, gid); + lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid); } closedir(d); } @@ -600,7 +923,7 @@ static bool recursive_rmdir(const char *dirname, int fd, const int cfd) rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); if (rc < 0 || rc >= MAXPATHLEN) { - fprintf(stderr, "pathname too long\n"); + lxcfs_error("%s\n", "Pathname too long."); continue; } @@ -616,7 +939,7 @@ static bool recursive_rmdir(const char *dirname, int fd, const int cfd) ret = true; if (closedir(dir) < 0) { - fprintf(stderr, "%s: failed to close directory %s: %s\n", __func__, dirname, strerror(errno)); + lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno)); ret = false; } @@ -766,7 +1089,7 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool cg = alloca(len); ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup); if (ret < 0 || (size_t)ret >= len) { - fprintf(stderr, "%s: pathname too long under %s\n", __func__, cgroup); + lxcfs_error("Pathname too long under %s\n", cgroup); return false; } @@ -787,13 +1110,13 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name); if (ret < 0 || ret >= MAXPATHLEN) { - fprintf(stderr, "%s: pathname too long under %s\n", __func__, cg); + lxcfs_error("Pathname too long under %s\n", cg); continue; } ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW); if (ret) { - fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno)); + lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno)); continue; } if ((!directories && !S_ISREG(mystat.st_mode)) || @@ -813,7 +1136,7 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool sz++; } if (closedir(dir) < 0) { - fprintf(stderr, "%s: failed closedir for %s: %s\n", __func__, cgroup, strerror(errno)); + lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno)); return false; } return true; @@ -870,16 +1193,38 @@ bool cgfs_get_value(const char *controller, const char *cgroup, const char *file fnam = alloca(len); ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file); if (ret < 0 || (size_t)ret >= len) - return NULL; + return false; fd = openat(cfd, fnam, O_RDONLY); if (fd < 0) - return NULL; + return false; *value = slurp_file(fnam, fd); return *value != NULL; } +bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file) +{ + int ret, cfd; + size_t len; + char *fnam, *tmpc; + + tmpc = find_mounted_controller(controller, &cfd); + if (!tmpc) + return false; + + /* Make sure we pass a relative path to *at() family of functions. + * . + /cgroup + / + file + \0 + */ + len = strlen(cgroup) + strlen(file) + 3; + fnam = alloca(len); + ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file); + if (ret < 0 || (size_t)ret >= len) + return false; + + return (faccessat(cfd, fnam, F_OK, 0) == 0); +} + struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file) { int ret, cfd; @@ -932,8 +1277,8 @@ static void *make_key_list_entry(const char *controller, const char *cgroup, con { struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry); if (!entry) { - fprintf(stderr, "%s: Error getting files under %s:%s\n", - __func__, controller, cgroup); + lxcfs_error("Error getting files under %s:%s\n", controller, + cgroup); } return entry; } @@ -1170,7 +1515,7 @@ convert_id_to_ns(FILE *idfile, unsigned int in_id) * uids wrapped around - unexpected as this is a procfile, * so just bail. */ - fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n", + lxcfs_error("pid wrapparound at entry %u %u %u in %s\n", nsuid, hostuid, count, line); return -1; } @@ -1276,7 +1621,7 @@ static char *get_next_cgroup_dir(const char *taskcg, const char *querycg) char *start, *end; if (strlen(taskcg) <= strlen(querycg)) { - fprintf(stderr, "%s: I was fed bad input\n", __func__); + lxcfs_error("%s\n", "I was fed bad input."); return NULL; } @@ -1507,7 +1852,7 @@ static char *pick_controller_from_path(struct fuse_context *fc, const char *path char *contr, *slash; if (strlen(path) < 9) { - errno = EINVAL; + errno = EACCES; return NULL; } if (*(path + 7) != '/') { @@ -1541,12 +1886,17 @@ static const char *find_cgroup_in_path(const char *path) { const char *p1; - if (strlen(path) < 9) + if (strlen(path) < 9) { + errno = EACCES; return NULL; - p1 = strstr(path+8, "/"); - if (!p1) + } + p1 = strstr(path + 8, "/"); + if (!p1) { + errno = EINVAL; return NULL; - return p1+1; + } + errno = 0; + return p1 + 1; } /* @@ -1605,7 +1955,7 @@ int cg_getattr(const char *path, struct stat *sb) controller = pick_controller_from_path(fc, path); if (!controller) - return -EIO; + return -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) { /* this is just /cgroup/controller, return it as a dir */ @@ -1675,11 +2025,6 @@ int cg_getattr(const char *path, struct stat *sb) ret = -ENOENT; goto out; } - if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) { - ret = -EACCES; - goto out; - } - ret = 0; } @@ -1705,7 +2050,7 @@ int cg_opendir(const char *path, struct fuse_file_info *fi) // return list of keys for the controller, and list of child cgroups controller = pick_controller_from_path(fc, path); if (!controller) - return -EIO; + return -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) { @@ -1753,7 +2098,7 @@ int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset return -EIO; if (d->type != LXC_TYPE_CGDIR) { - fprintf(stderr, "Internal error: file cache info used in readdir\n"); + lxcfs_error("%s\n", "Internal error: file cache info used in readdir."); return -EIO; } if (!d->cgroup && !d->controller) { @@ -1841,6 +2186,7 @@ static void do_release_file_info(struct fuse_file_info *fi) free(f->buf); f->buf = NULL; free(f); + f = NULL; } int cg_releasedir(const char *path, struct fuse_file_info *fi) @@ -1863,10 +2209,10 @@ int cg_open(const char *path, struct fuse_file_info *fi) controller = pick_controller_from_path(fc, path); if (!controller) - return -EIO; + return -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) - return -EINVAL; + return -errno; get_cgdir_and_path(cgroup, &cgdir, &last); if (!last) { @@ -1926,18 +2272,15 @@ int cg_access(const char *path, int mode) struct cgfs_files *k = NULL; struct fuse_context *fc = fuse_get_context(); - if (strcmp(path, "/cgroup") == 0) { - if ((mode & W_OK) == 0) - return -EACCES; + if (strcmp(path, "/cgroup") == 0) return 0; - } if (!fc) return -EIO; controller = pick_controller_from_path(fc, path); if (!controller) - return -EIO; + return -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) { // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not @@ -2001,14 +2344,14 @@ static bool wait_for_sock(int sock, int timeout) return false; if ((epfd = epoll_create(1)) < 0) { - fprintf(stderr, "Failed to create epoll socket: %m\n"); + lxcfs_error("%s\n", "Failed to create epoll socket: %m."); return false; } ev.events = POLLIN_SET; ev.data.fd = sock; if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) { - fprintf(stderr, "Failed adding socket to epoll: %m\n"); + lxcfs_error("%s\n", "Failed adding socket to epoll: %m."); close(epfd); return false; } @@ -2056,8 +2399,7 @@ static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst) if (pingfirst) { if (msgrecv(sock, buf, 1) != 1) { - fprintf(stderr, "%s: Error getting reply from server over socketpair\n", - __func__); + lxcfs_error("%s\n", "Error getting reply from server over socketpair."); return SEND_CREDS_FAIL; } } @@ -2081,8 +2423,7 @@ static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst) msg.msg_iovlen = 1; if (sendmsg(sock, &msg, 0) < 0) { - fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__, - strerror(errno)); + lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno)); if (errno == 3) return SEND_CREDS_NOTSK; return SEND_CREDS_FAIL; @@ -2108,12 +2449,12 @@ static bool recv_creds(int sock, struct ucred *cred, char *v) cred->gid = -1; if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) { - fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno)); + lxcfs_error("Failed to set passcred: %s\n", strerror(errno)); return false; } buf[0] = '1'; if (write(sock, buf, 1) != 1) { - fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno)); + lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno)); return false; } @@ -2128,14 +2469,12 @@ static bool recv_creds(int sock, struct ucred *cred, char *v) msg.msg_iovlen = 1; if (!wait_for_sock(sock, 2)) { - fprintf(stderr, "Timed out waiting for scm_cred: %s\n", - strerror(errno)); + lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno)); return false; } ret = recvmsg(sock, &msg, MSG_DONTWAIT); if (ret < 0) { - fprintf(stderr, "Failed to receive scm_cred: %s\n", - strerror(errno)); + lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno)); return false; } @@ -2168,10 +2507,8 @@ static int pid_ns_clone_wrapper(void *arg) { char b = '1'; close(args->cpipe[0]); - if (write(args->cpipe[1], &b, sizeof(char)) < 0) { - fprintf(stderr, "%s (child): error on write: %s\n", - __func__, strerror(errno)); - } + if (write(args->cpipe[1], &b, sizeof(char)) < 0) + lxcfs_error("(child): error on write: %s.\n", strerror(errno)); close(args->cpipe[1]); return args->wrapped(args->sock, args->tpid); } @@ -2305,13 +2642,11 @@ bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *fi // read converted results if (!wait_for_sock(sock[0], 2)) { - fprintf(stderr, "%s: timed out waiting for pid from child: %s\n", - __func__, strerror(errno)); + lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno)); goto out; } if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { - fprintf(stderr, "%s: error reading pid from child: %s\n", - __func__, strerror(errno)); + lxcfs_error("Error reading pid from child: %s.\n", strerror(errno)); goto out; } must_strcat_pid(d, &sz, &asz, qpid); @@ -2326,8 +2661,7 @@ next: v = '1'; if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) { // failed to ask child to exit - fprintf(stderr, "%s: failed to ask child to exit: %s\n", - __func__, strerror(errno)); + lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno)); goto out; } @@ -2355,7 +2689,7 @@ int cg_read(const char *path, char *buf, size_t size, off_t offset, bool r; if (f->type != LXC_TYPE_CGFILE) { - fprintf(stderr, "Internal error: directory cache info used in cg_read\n"); + lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read."); return -EIO; } @@ -2422,12 +2756,11 @@ static int pid_from_ns(int sock, pid_t tpid) cred.gid = 0; while (1) { if (!wait_for_sock(sock, 2)) { - fprintf(stderr, "%s: timeout reading from parent\n", __func__); + lxcfs_error("%s\n", "Timeout reading from parent."); return 1; } if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) { - fprintf(stderr, "%s: bad read from parent: %s\n", - __func__, strerror(errno)); + lxcfs_error("Bad read from parent: %s.\n", strerror(errno)); return 1; } if (vpid == -1) // done @@ -2528,20 +2861,20 @@ void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid) *gid = -1; sprintf(line, "/proc/%d/status", pid); if ((f = fopen(line, "r")) == NULL) { - fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno)); + lxcfs_error("Error opening %s: %s\n", line, strerror(errno)); return; } while (fgets(line, 400, f)) { if (strncmp(line, "Uid:", 4) == 0) { if (sscanf(line+4, "%u", &u) != 1) { - fprintf(stderr, "bad uid line for pid %u\n", pid); + lxcfs_error("bad uid line for pid %u\n", pid); fclose(f); return; } *uid = u; } else if (strncmp(line, "Gid:", 4) == 0) { if (sscanf(line+4, "%u", &g) != 1) { - fprintf(stderr, "bad gid line for pid %u\n", pid); + lxcfs_error("bad gid line for pid %u\n", pid); fclose(f); return; } @@ -2613,8 +2946,7 @@ static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char char v; if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { - fprintf(stderr, "%s: error writing pid to child: %s\n", - __func__, strerror(errno)); + lxcfs_error("Error writing pid to child: %s.\n", strerror(errno)); goto out; } @@ -2638,7 +2970,7 @@ static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char /* All good, write the value */ qpid = -1; if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid)) - fprintf(stderr, "Warning: failed to ask child to exit\n"); + lxcfs_error("%s\n", "Warning: failed to ask child to exit."); if (!fail) answer = true; @@ -2667,7 +2999,7 @@ int cg_write(const char *path, const char *buf, size_t size, off_t offset, bool r; if (f->type != LXC_TYPE_CGFILE) { - fprintf(stderr, "Internal error: directory cache info used in cg_write\n"); + lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write."); return -EIO; } @@ -2720,15 +3052,16 @@ int cg_chown(const char *path, uid_t uid, gid_t gid) return -EIO; if (strcmp(path, "/cgroup") == 0) - return -EINVAL; + return -EPERM; controller = pick_controller_from_path(fc, path); if (!controller) - return -EINVAL; + return errno == ENOENT ? -EPERM : -errno; + cgroup = find_cgroup_in_path(path); if (!cgroup) /* this is just /cgroup/controller */ - return -EINVAL; + return -EPERM; get_cgdir_and_path(cgroup, &cgdir, &last); @@ -2785,15 +3118,16 @@ int cg_chmod(const char *path, mode_t mode) return -EIO; if (strcmp(path, "/cgroup") == 0) - return -EINVAL; + return -EPERM; controller = pick_controller_from_path(fc, path); if (!controller) - return -EINVAL; + return errno == ENOENT ? -EPERM : -errno; + cgroup = find_cgroup_in_path(path); if (!cgroup) /* this is just /cgroup/controller */ - return -EINVAL; + return -EPERM; get_cgdir_and_path(cgroup, &cgdir, &last); @@ -2851,14 +3185,13 @@ int cg_mkdir(const char *path, mode_t mode) if (!fc) return -EIO; - controller = pick_controller_from_path(fc, path); if (!controller) - return -EINVAL; + return errno == ENOENT ? -EPERM : -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) - return -EINVAL; + return -errno; get_cgdir_and_path(cgroup, &cgdir, &last); if (!last) @@ -2875,7 +3208,7 @@ int cg_mkdir(const char *path, mode_t mode) else if (last && strcmp(next, last) == 0) ret = -EEXIST; else - ret = -ENOENT; + ret = -EPERM; goto out; } @@ -2907,16 +3240,20 @@ int cg_rmdir(const char *path) return -EIO; controller = pick_controller_from_path(fc, path); - if (!controller) - return -EINVAL; + if (!controller) /* Someone's trying to delete "/cgroup". */ + return -EPERM; cgroup = find_cgroup_in_path(path); - if (!cgroup) - return -EINVAL; + if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */ + return -EPERM; get_cgdir_and_path(cgroup, &cgdir, &last); if (!last) { - ret = -EINVAL; + /* Someone's trying to delete a cgroup on the same level as the + * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or + * rmdir "/cgroup/blkio/init.slice". + */ + ret = -EPERM; goto out; } @@ -2924,7 +3261,7 @@ int cg_rmdir(const char *path) if (initpid <= 0) initpid = fc->pid; if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) { - if (!last || strcmp(next, last) == 0) + if (!last || (next && (strcmp(next, last) == 0))) ret = -EBUSY; else ret = -ENOENT; @@ -2963,29 +3300,32 @@ static bool startswith(const char *line, const char *pref) static void parse_memstat(char *memstat, unsigned long *cached, unsigned long *active_anon, unsigned long *inactive_anon, unsigned long *active_file, unsigned long *inactive_file, - unsigned long *unevictable) + unsigned long *unevictable, unsigned long *shmem) { char *eol; while (*memstat) { - if (startswith(memstat, "cache")) { + if (startswith(memstat, "total_cache")) { sscanf(memstat + 11, "%lu", cached); *cached /= 1024; - } else if (startswith(memstat, "active_anon")) { - sscanf(memstat + 11, "%lu", active_anon); + } else if (startswith(memstat, "total_active_anon")) { + sscanf(memstat + 17, "%lu", active_anon); *active_anon /= 1024; - } else if (startswith(memstat, "inactive_anon")) { - sscanf(memstat + 11, "%lu", inactive_anon); + } else if (startswith(memstat, "total_inactive_anon")) { + sscanf(memstat + 19, "%lu", inactive_anon); *inactive_anon /= 1024; - } else if (startswith(memstat, "active_file")) { - sscanf(memstat + 11, "%lu", active_file); + } else if (startswith(memstat, "total_active_file")) { + sscanf(memstat + 17, "%lu", active_file); *active_file /= 1024; - } else if (startswith(memstat, "inactive_file")) { - sscanf(memstat + 11, "%lu", inactive_file); + } else if (startswith(memstat, "total_inactive_file")) { + sscanf(memstat + 19, "%lu", inactive_file); *inactive_file /= 1024; - } else if (startswith(memstat, "unevictable")) { - sscanf(memstat + 11, "%lu", unevictable); + } else if (startswith(memstat, "total_unevictable")) { + sscanf(memstat + 17, "%lu", unevictable); *unevictable /= 1024; + } else if (startswith(memstat, "total_shmem")) { + sscanf(memstat + 11, "%lu", shmem); + *shmem /= 1024; } eol = strchr(memstat, '\n'); if (!eol) @@ -3036,7 +3376,7 @@ static int read_file(const char *path, char *buf, size_t size, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3062,12 +3402,12 @@ static int read_file(const char *path, char *buf, size_t size, * FUSE ops for /proc */ -static unsigned long get_memlimit(const char *cgroup) +static unsigned long get_memlimit(const char *cgroup, const char *file) { char *memlimit_str = NULL; unsigned long memlimit = -1; - if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &memlimit_str)) + if (cgfs_get_value("memory", cgroup, file, &memlimit_str)) memlimit = strtoul(memlimit_str, NULL, 10); free(memlimit_str); @@ -3075,16 +3415,16 @@ static unsigned long get_memlimit(const char *cgroup) return memlimit; } -static unsigned long get_min_memlimit(const char *cgroup) +static unsigned long get_min_memlimit(const char *cgroup, const char *file) { char *copy = strdupa(cgroup); unsigned long memlimit = 0, retlimit; - retlimit = get_memlimit(copy); + retlimit = get_memlimit(copy, file); while (strcmp(copy, "/") != 0) { copy = dirname(copy); - memlimit = get_memlimit(copy); + memlimit = get_memlimit(copy, file); if (memlimit != -1 && memlimit < retlimit) retlimit = memlimit; }; @@ -3099,11 +3439,11 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, struct file_info *d = (struct file_info *)fi->fh; char *cg; char *memusage_str = NULL, *memstat_str = NULL, - *memswlimit_str = NULL, *memswusage_str = NULL, - *memswlimit_default_str = NULL, *memswusage_default_str = NULL; + *memswlimit_str = NULL, *memswusage_str = NULL; unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0, cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0, - active_file = 0, inactive_file = 0, unevictable = 0; + active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0, + hostswtotal = 0; char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; char *cache = d->buf; @@ -3129,7 +3469,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, return read_file("/proc/meminfo", buf, size, d); prune_init_slice(cg); - memlimit = get_min_memlimit(cg); + memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str)) goto err; if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str)) @@ -3140,20 +3480,9 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) && cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str)) { - /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */ - if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str)) - goto err; - if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str)) - goto err; - - memswlimit = strtoul(memswlimit_str, NULL, 10); + memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes"); memswusage = strtoul(memswusage_str, NULL, 10); - if (!strcmp(memswlimit_str, memswlimit_default_str)) - memswlimit = 0; - if (!strcmp(memswusage_str, memswusage_default_str)) - memswusage = 0; - memswlimit = memswlimit / 1024; memswusage = memswusage / 1024; } @@ -3164,7 +3493,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, parse_memstat(memstat_str, &cached, &active_anon, &inactive_anon, &active_file, &inactive_file, - &unevictable); + &unevictable, &shmem); f = fopen("/proc/meminfo", "r"); if (!f) @@ -3176,7 +3505,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, memset(lbuf, 0, 100); if (startswith(line, "MemTotal:")) { - sscanf(line+14, "%lu", &hosttotal); + sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal); if (hosttotal < memlimit) memlimit = hosttotal; snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit); @@ -3185,13 +3514,16 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage); printme = lbuf; } else if (startswith(line, "MemAvailable:")) { - snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage); + snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached); printme = lbuf; } else if (startswith(line, "SwapTotal:") && memswlimit > 0) { - snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit); + sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal); + if (hostswtotal < memswlimit) + memswlimit = hostswtotal; + snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit); printme = lbuf; } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) { - unsigned long swaptotal = memswlimit - memlimit, + unsigned long swaptotal = memswlimit, swapusage = memswusage - memusage, swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0; snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree); @@ -3208,11 +3540,11 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, } else if (startswith(line, "SwapCached:")) { snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL); printme = lbuf; - } else if (startswith(line, "Active")) { + } else if (startswith(line, "Active:")) { snprintf(lbuf, 100, "Active: %8lu kB\n", active_anon + active_file); printme = lbuf; - } else if (startswith(line, "Inactive")) { + } else if (startswith(line, "Inactive:")) { snprintf(lbuf, 100, "Inactive: %8lu kB\n", inactive_anon + inactive_file); printme = lbuf; @@ -3237,6 +3569,15 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, } else if (startswith(line, "SUnreclaim")) { snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL); printme = lbuf; + } else if (startswith(line, "Shmem:")) { + snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem); + printme = lbuf; + } else if (startswith(line, "ShmemHugePages")) { + snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL); + printme = lbuf; + } else if (startswith(line, "ShmemPmdMapped")) { + snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL); + printme = lbuf; } else printme = line; @@ -3248,7 +3589,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3273,8 +3614,6 @@ err: free(memswlimit_str); free(memswusage_str); free(memstat_str); - free(memswlimit_default_str); - free(memswusage_default_str); return rv; } @@ -3302,6 +3641,85 @@ static bool cpuline_in_cpuset(const char *line, const char *cpuset) return cpu_in_cpuset(cpu, cpuset); } +/* + * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`, + * depending on `param`. Parameter value is returned throuh `value`. + */ +static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value) +{ + bool rv = false; + char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0 + char *str = NULL; + + sprintf(file, "cpu.cfs_%s_us", param); + + if (!cgfs_get_value("cpu", cg, file, &str)) + goto err; + + if (sscanf(str, "%ld", value) != 1) + goto err; + + rv = true; + +err: + if (str) + free(str); + return rv; +} + +/* + * Return the maximum number of visible CPUs based on CPU quotas. + * If there is no quota set, zero is returned. + */ +int max_cpu_count(const char *cg) +{ + int rv, nprocs; + int64_t cfs_quota, cfs_period; + + if (!read_cpu_cfs_param(cg, "quota", &cfs_quota)) + return 0; + + if (!read_cpu_cfs_param(cg, "period", &cfs_period)) + return 0; + + if (cfs_quota <= 0 || cfs_period <= 0) + return 0; + + rv = cfs_quota / cfs_period; + + /* In case quota/period does not yield a whole number, add one CPU for + * the remainder. + */ + if ((cfs_quota % cfs_period) > 0) + rv += 1; + + nprocs = get_nprocs(); + + if (rv > nprocs) + rv = nprocs; + + return rv; +} + +/* + * Determine whether CPU views should be used or not. + */ +bool use_cpuview(const char *cg) +{ + int cfd; + char *tmpc; + + tmpc = find_mounted_controller("cpu", &cfd); + if (!tmpc) + return false; + + tmpc = find_mounted_controller("cpuacct", &cfd); + if (!tmpc) + return false; + + return true; +} + /* * check whether this is a '^processor" line in /proc/cpuinfo */ @@ -3324,7 +3742,8 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, char *line = NULL; size_t linelen = 0, total_len = 0, rv = 0; bool am_printing = false, firstline = true, is_s390x = false; - int curcpu = -1, cpu; + int curcpu = -1, cpu, max_cpus = 0; + bool use_view; char *cache = d->buf; size_t cache_size = d->buflen; FILE *f = NULL; @@ -3352,6 +3771,11 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, if (!cpuset) goto err; + use_view = use_cpuview(cg); + + if (use_view) + max_cpus = max_cpu_count(cg); + f = fopen("/proc/cpuinfo", "r"); if (!f) goto err; @@ -3369,6 +3793,8 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, if (strncmp(line, "# processors:", 12) == 0) continue; if (is_processor_line(line)) { + if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus) + break; am_printing = cpuline_in_cpuset(line, cpuset); if (am_printing) { curcpu ++; @@ -3379,7 +3805,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3390,6 +3816,8 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, continue; } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) { char *p; + if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus) + break; if (!cpu_in_cpuset(cpu, cpuset)) continue; curcpu ++; @@ -3404,7 +3832,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3422,7 +3850,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3480,27 +3908,815 @@ err: return rv; } -static int proc_stat_read(char *buf, size_t size, off_t offset, - struct fuse_file_info *fi) +static uint64_t get_reaper_start_time(pid_t pid) { - struct fuse_context *fc = fuse_get_context(); - struct file_info *d = (struct file_info *)fi->fh; - char *cg; - char *cpuset = NULL; - char *line = NULL; - size_t linelen = 0, total_len = 0, rv = 0; - int curcpu = -1; /* cpu numbering starts at 0 */ - unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0; - unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0, - irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0; -#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE - char cpuall[CPUALL_MAX_SIZE]; - /* reserve for cpu all */ - char *cache = d->buf + CPUALL_MAX_SIZE; - size_t cache_size = d->buflen - CPUALL_MAX_SIZE; - FILE *f = NULL; - - if (offset){ + int ret; + FILE *f; + uint64_t starttime; + /* strlen("/proc/") = 6 + * + + * LXCFS_NUMSTRLEN64 + * + + * strlen("/stat") = 5 + * + + * \0 = 1 + * */ +#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1) + char path[__PROC_PID_STAT_LEN]; + pid_t qpid; + + qpid = lookup_initpid_in_store(pid); + if (qpid <= 0) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid); + if (ret < 0 || ret >= __PROC_PID_STAT_LEN) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + f = fopen(path, "r"); + if (!f) { + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + /* Note that the *scanf() argument supression requires that length + * modifiers such as "l" are omitted. Otherwise some compilers will yell + * at us. It's like telling someone you're not married and then asking + * if you can bring your wife to the party. + */ + ret = fscanf(f, "%*d " /* (1) pid %d */ + "%*s " /* (2) comm %s */ + "%*c " /* (3) state %c */ + "%*d " /* (4) ppid %d */ + "%*d " /* (5) pgrp %d */ + "%*d " /* (6) session %d */ + "%*d " /* (7) tty_nr %d */ + "%*d " /* (8) tpgid %d */ + "%*u " /* (9) flags %u */ + "%*u " /* (10) minflt %lu */ + "%*u " /* (11) cminflt %lu */ + "%*u " /* (12) majflt %lu */ + "%*u " /* (13) cmajflt %lu */ + "%*u " /* (14) utime %lu */ + "%*u " /* (15) stime %lu */ + "%*d " /* (16) cutime %ld */ + "%*d " /* (17) cstime %ld */ + "%*d " /* (18) priority %ld */ + "%*d " /* (19) nice %ld */ + "%*d " /* (20) num_threads %ld */ + "%*d " /* (21) itrealvalue %ld */ + "%" PRIu64, /* (22) starttime %llu */ + &starttime); + if (ret != 1) { + fclose(f); + /* Caller can check for EINVAL on 0. */ + errno = EINVAL; + return 0; + } + + fclose(f); + + errno = 0; + return starttime; +} + +static uint64_t get_reaper_start_time_in_sec(pid_t pid) +{ + uint64_t clockticks; + int64_t ticks_per_sec; + + clockticks = get_reaper_start_time(pid); + if (clockticks == 0 && errno == EINVAL) { + lxcfs_debug("failed to retrieve start time of pid %d\n", pid); + return 0; + } + + ticks_per_sec = sysconf(_SC_CLK_TCK); + if (ticks_per_sec < 0 && errno == EINVAL) { + lxcfs_debug( + "%s\n", + "failed to determine number of clock ticks in a second"); + return 0; + } + + return (clockticks /= ticks_per_sec); +} + +static uint64_t get_reaper_age(pid_t pid) +{ + uint64_t procstart, uptime, procage; + + /* We need to substract the time the process has started since system + * boot minus the time when the system has started to get the actual + * reaper age. + */ + procstart = get_reaper_start_time_in_sec(pid); + procage = procstart; + if (procstart > 0) { + int ret; + struct timespec spec; + + ret = clock_gettime(CLOCK_BOOTTIME, &spec); + if (ret < 0) + return 0; + /* We could make this more precise here by using the tv_nsec + * field in the timespec struct and convert it to milliseconds + * and then create a double for the seconds and milliseconds but + * that seems more work than it is worth. + */ + uptime = spec.tv_sec; + procage = uptime - procstart; + } + + return procage; +} + +/* + * Returns 0 on success. + * It is the caller's responsibility to free `return_usage`, unless this + * function returns an error. + */ +static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage) +{ + int cpucount = get_nprocs(); + struct cpuacct_usage *cpu_usage; + int rv = 0, i, j, ret, read_pos = 0, read_cnt; + int cg_cpu; + uint64_t cg_user, cg_system; + int64_t ticks_per_sec; + char *usage_str = NULL; + + ticks_per_sec = sysconf(_SC_CLK_TCK); + + if (ticks_per_sec < 0 && errno == EINVAL) { + lxcfs_debug( + "%s\n", + "read_cpuacct_usage_all failed to determine number of clock ticks " + "in a second"); + return -1; + } + + cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount); + if (!cpu_usage) + return -ENOMEM; + + if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) { + rv = -1; + goto err; + } + + if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) { + lxcfs_error("read_cpuacct_usage_all reading first line from " + "%s/cpuacct.usage_all failed.\n", cg); + rv = -1; + goto err; + } + + read_pos += read_cnt; + + for (i = 0, j = 0; i < cpucount; i++) { + ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user, + &cg_system, &read_cnt); + + if (ret == EOF) + break; + + if (ret != 3) { + lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all " + "failed.\n", cg); + rv = -1; + goto err; + } + + read_pos += read_cnt; + + if (!cpu_in_cpuset(i, cpuset)) + continue; + + /* Convert the time from nanoseconds to USER_HZ */ + cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec; + cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec; + j++; + } + + rv = 0; + *return_usage = cpu_usage; + +err: + if (usage_str) + free(usage_str); + + if (rv != 0) { + free(cpu_usage); + *return_usage = NULL; + } + + return rv; +} + +static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count) +{ + int i; + unsigned long sum = 0; + + for (i = 0; i < cpu_count; i++) { + /* When cpuset is changed on the fly, the CPUs might get reordered. + * We could either reset all counters, or check that the substractions + * below will return expected results. + */ + if (newer[i].user > older[i].user) + diff[i].user = newer[i].user - older[i].user; + else + diff[i].user = 0; + + if (newer[i].system > older[i].system) + diff[i].system = newer[i].system - older[i].system; + else + diff[i].system = 0; + + if (newer[i].idle > older[i].idle) + diff[i].idle = newer[i].idle - older[i].idle; + else + diff[i].idle = 0; + + sum += diff[i].user; + sum += diff[i].system; + sum += diff[i].idle; + } + + return sum; +} + +static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold) +{ + unsigned long free_space, to_add; + + free_space = threshold - usage->user - usage->system; + + if (free_space > usage->idle) + free_space = usage->idle; + + to_add = free_space > *surplus ? *surplus : free_space; + + *counter += to_add; + usage->idle -= to_add; + *surplus -= to_add; +} + +static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node) +{ + struct cg_proc_stat *first = NULL, *prev, *tmp; + + for (prev = NULL; node; ) { + if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) { + tmp = node; + lxcfs_debug("Removing stat node for %s\n", node->cg); + + if (prev) + prev->next = node->next; + else + first = node->next; + + node = node->next; + free_proc_stat_node(tmp); + } else { + if (!first) + first = node; + prev = node; + node = node->next; + } + } + + return first; +} + +#define PROC_STAT_PRUNE_INTERVAL 10 +static void prune_proc_stat_history(void) +{ + int i; + time_t now = time(NULL); + + for (i = 0; i < CPUVIEW_HASH_SIZE; i++) { + pthread_rwlock_wrlock(&proc_stat_history[i]->lock); + + if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) { + pthread_rwlock_unlock(&proc_stat_history[i]->lock); + return; + } + + if (proc_stat_history[i]->next) { + proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next); + proc_stat_history[i]->lastcheck = now; + } + + pthread_rwlock_unlock(&proc_stat_history[i]->lock); + } +} + +static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg) +{ + struct cg_proc_stat *node; + + pthread_rwlock_rdlock(&head->lock); + + if (!head->next) { + pthread_rwlock_unlock(&head->lock); + return NULL; + } + + node = head->next; + + do { + if (strcmp(cg, node->cg) == 0) + goto out; + } while ((node = node->next)); + + node = NULL; + +out: + pthread_rwlock_unlock(&head->lock); + prune_proc_stat_history(); + return node; +} + +static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg) +{ + struct cg_proc_stat *node; + int i; + + node = malloc(sizeof(struct cg_proc_stat)); + if (!node) + goto err; + + node->cg = NULL; + node->usage = NULL; + node->view = NULL; + + node->cg = malloc(strlen(cg) + 1); + if (!node->cg) + goto err; + + strcpy(node->cg, cg); + + node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count); + if (!node->usage) + goto err; + + memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count); + + node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count); + if (!node->view) + goto err; + + node->cpu_count = cpu_count; + node->next = NULL; + + if (pthread_mutex_init(&node->lock, NULL) != 0) { + lxcfs_error("%s\n", "Failed to initialize node lock"); + goto err; + } + + for (i = 0; i < cpu_count; i++) { + node->view[i].user = 0; + node->view[i].system = 0; + node->view[i].idle = 0; + } + + return node; + +err: + if (node && node->cg) + free(node->cg); + if (node && node->usage) + free(node->usage); + if (node && node->view) + free(node->view); + if (node) + free(node); + + return NULL; +} + +static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node) +{ + int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE; + struct cg_proc_stat_head *head = proc_stat_history[hash]; + struct cg_proc_stat *node, *rv = new_node; + + pthread_rwlock_wrlock(&head->lock); + + if (!head->next) { + head->next = new_node; + goto out; + } + + node = head->next; + + for (;;) { + if (strcmp(node->cg, new_node->cg) == 0) { + /* The node is already present, return it */ + free_proc_stat_node(new_node); + rv = node; + goto out; + } + + if (node->next) { + node = node->next; + continue; + } + + node->next = new_node; + goto out; + } + +out: + pthread_rwlock_unlock(&head->lock); + return rv; +} + +static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count) +{ + struct cpuacct_usage *new_usage, *new_view; + int i; + + /* Allocate new memory */ + new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count); + if (!new_usage) + return false; + + new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count); + if (!new_view) { + free(new_usage); + return false; + } + + /* Copy existing data & initialize new elements */ + for (i = 0; i < cpu_count; i++) { + if (i < node->cpu_count) { + new_usage[i].user = node->usage[i].user; + new_usage[i].system = node->usage[i].system; + new_usage[i].idle = node->usage[i].idle; + + new_view[i].user = node->view[i].user; + new_view[i].system = node->view[i].system; + new_view[i].idle = node->view[i].idle; + } else { + new_usage[i].user = 0; + new_usage[i].system = 0; + new_usage[i].idle = 0; + + new_view[i].user = 0; + new_view[i].system = 0; + new_view[i].idle = 0; + } + } + + free(node->usage); + free(node->view); + + node->usage = new_usage; + node->view = new_view; + node->cpu_count = cpu_count; + + return true; +} + +static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg) +{ + int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE; + struct cg_proc_stat_head *head = proc_stat_history[hash]; + struct cg_proc_stat *node; + + node = find_proc_stat_node(head, cg); + + if (!node) { + node = new_proc_stat_node(usage, cpu_count, cg); + if (!node) + return NULL; + + node = add_proc_stat_node(node); + lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg); + } + + pthread_mutex_lock(&node->lock); + + /* If additional CPUs on the host have been enabled, CPU usage counter + * arrays have to be expanded */ + if (node->cpu_count < cpu_count) { + lxcfs_debug("Expanding stat node %d->%d for %s\n", + node->cpu_count, cpu_count, cg); + + if (!expand_proc_stat_node(node, cpu_count)) { + pthread_mutex_unlock(&node->lock); + lxcfs_debug("Unable to expand stat node %d->%d for %s\n", + node->cpu_count, cpu_count, cg); + return NULL; + } + } + + return node; +} + +static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count) +{ + int i; + + lxcfs_debug("Resetting stat node for %s\n", node->cg); + memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count); + + for (i = 0; i < cpu_count; i++) { + node->view[i].user = 0; + node->view[i].system = 0; + node->view[i].idle = 0; + } + + node->cpu_count = cpu_count; +} + +static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, FILE *f, char *buf, size_t buf_size) +{ + char *line = NULL; + size_t linelen = 0, total_len = 0, rv = 0, l; + int curcpu = -1; /* cpu numbering starts at 0 */ + int max_cpus = max_cpu_count(cg), cpu_cnt = 0; + unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0; + unsigned long user_sum = 0, system_sum = 0, idle_sum = 0; + unsigned long user_surplus = 0, system_surplus = 0; + unsigned long total_sum, threshold; + struct cg_proc_stat *stat_node; + struct cpuacct_usage *diff = NULL; + int nprocs = get_nprocs(); + + /* Read all CPU stats and stop when we've encountered other lines */ + while (getline(&line, &linelen, f) != -1) { + int cpu, ret; + char cpu_char[10]; /* That's a lot of cores */ + uint64_t all_used, cg_used; + + if (strlen(line) == 0) + continue; + if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) { + /* not a ^cpuN line containing a number N */ + break; + } + + if (sscanf(cpu_char, "%d", &cpu) != 1) + continue; + if (!cpu_in_cpuset(cpu, cpuset)) + continue; + curcpu ++; + cpu_cnt ++; + + ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + &user, + &nice, + &system, + &idle, + &iowait, + &irq, + &softirq, + &steal, + &guest, + &guest_nice); + + if (ret != 10) + continue; + + all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice; + cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system; + + if (all_used >= cg_used) { + cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used); + + } else { + lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, " + "%lu in cpuacct.usage_all; unable to determine idle time\n", + curcpu, cg, all_used, cg_used); + cg_cpu_usage[curcpu].idle = idle; + } + } + + /* Cannot use more CPUs than is available due to cpuset */ + if (max_cpus > cpu_cnt) + max_cpus = cpu_cnt; + + stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg); + + if (!stat_node) { + lxcfs_error("unable to find/create stat node for %s\n", cg); + rv = 0; + goto err; + } + + diff = malloc(sizeof(struct cpuacct_usage) * nprocs); + if (!diff) { + rv = 0; + goto err; + } + + /* + * If the new values are LOWER than values stored in memory, it means + * the cgroup has been reset/recreated and we should reset too. + */ + if (cg_cpu_usage[0].user < stat_node->usage[0].user) + reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs); + + total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, cpu_cnt); + + for (curcpu = 0; curcpu < cpu_cnt; curcpu++) { + stat_node->usage[curcpu].user += diff[curcpu].user; + stat_node->usage[curcpu].system += diff[curcpu].system; + stat_node->usage[curcpu].idle += diff[curcpu].idle; + + if (max_cpus > 0 && curcpu >= max_cpus) { + user_surplus += diff[curcpu].user; + system_surplus += diff[curcpu].system; + } + } + + /* Calculate usage counters of visible CPUs */ + if (max_cpus > 0) { + /* threshold = maximum usage per cpu, including idle */ + threshold = total_sum / cpu_cnt * max_cpus; + + for (curcpu = 0; curcpu < max_cpus; curcpu++) { + if (diff[curcpu].user + diff[curcpu].system >= threshold) + continue; + + /* Add user */ + add_cpu_usage( + &user_surplus, + &diff[curcpu], + &diff[curcpu].user, + threshold); + + if (diff[curcpu].user + diff[curcpu].system >= threshold) + continue; + + /* If there is still room, add system */ + add_cpu_usage( + &system_surplus, + &diff[curcpu], + &diff[curcpu].system, + threshold); + } + + if (user_surplus > 0) + lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg); + if (system_surplus > 0) + lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg); + + for (curcpu = 0; curcpu < max_cpus; curcpu++) { + stat_node->view[curcpu].user += diff[curcpu].user; + stat_node->view[curcpu].system += diff[curcpu].system; + stat_node->view[curcpu].idle += diff[curcpu].idle; + + user_sum += stat_node->view[curcpu].user; + system_sum += stat_node->view[curcpu].system; + idle_sum += stat_node->view[curcpu].idle; + } + + } else { + for (curcpu = 0; curcpu < cpu_cnt; curcpu++) { + stat_node->view[curcpu].user = stat_node->usage[curcpu].user; + stat_node->view[curcpu].system = stat_node->usage[curcpu].system; + stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle; + + user_sum += stat_node->view[curcpu].user; + system_sum += stat_node->view[curcpu].system; + idle_sum += stat_node->view[curcpu].idle; + } + } + + /* Render the file */ + /* cpu-all */ + l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n", + user_sum, + system_sum, + idle_sum); + + if (l < 0) { + perror("Error writing to cache"); + rv = 0; + goto err; + + } + if (l >= buf_size) { + lxcfs_error("%s\n", "Internal error: truncated write to cache."); + rv = 0; + goto err; + } + + buf += l; + buf_size -= l; + total_len += l; + + /* Render visible CPUs */ + for (curcpu = 0; curcpu < cpu_cnt; curcpu++) { + if (max_cpus > 0 && curcpu == max_cpus) + break; + + l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n", + curcpu, + stat_node->view[curcpu].user, + stat_node->view[curcpu].system, + stat_node->view[curcpu].idle); + + if (l < 0) { + perror("Error writing to cache"); + rv = 0; + goto err; + + } + if (l >= buf_size) { + lxcfs_error("%s\n", "Internal error: truncated write to cache."); + rv = 0; + goto err; + } + + buf += l; + buf_size -= l; + total_len += l; + } + + /* Pass the rest of /proc/stat, start with the last line read */ + l = snprintf(buf, buf_size, "%s", line); + + if (l < 0) { + perror("Error writing to cache"); + rv = 0; + goto err; + + } + if (l >= buf_size) { + lxcfs_error("%s\n", "Internal error: truncated write to cache."); + rv = 0; + goto err; + } + + buf += l; + buf_size -= l; + total_len += l; + + /* Pass the rest of the host's /proc/stat */ + while (getline(&line, &linelen, f) != -1) { + l = snprintf(buf, buf_size, "%s", line); + if (l < 0) { + perror("Error writing to cache"); + rv = 0; + goto err; + } + if (l >= buf_size) { + lxcfs_error("%s\n", "Internal error: truncated write to cache."); + rv = 0; + goto err; + } + buf += l; + buf_size -= l; + total_len += l; + } + + rv = total_len; + +err: + if (stat_node) + pthread_mutex_unlock(&stat_node->lock); + if (line) + free(line); + if (diff) + free(diff); + return rv; +} + +#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2) +static int proc_stat_read(char *buf, size_t size, off_t offset, + struct fuse_file_info *fi) +{ + struct fuse_context *fc = fuse_get_context(); + struct file_info *d = (struct file_info *)fi->fh; + char *cg; + char *cpuset = NULL; + char *line = NULL; + size_t linelen = 0, total_len = 0, rv = 0; + int curcpu = -1; /* cpu numbering starts at 0 */ + unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0; + unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0, + irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0; + char cpuall[CPUALL_MAX_SIZE]; + /* reserve for cpu all */ + char *cache = d->buf + CPUALL_MAX_SIZE; + size_t cache_size = d->buflen - CPUALL_MAX_SIZE; + FILE *f = NULL; + struct cpuacct_usage *cg_cpu_usage = NULL; + + if (offset){ if (offset > d->size) return -EINVAL; if (!d->cached) @@ -3523,21 +4739,38 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, if (!cpuset) goto err; + /* + * Read cpuacct.usage_all for all CPUs. + * If the cpuacct cgroup is present, it is used to calculate the container's + * CPU usage. If not, values from the host's /proc/stat are used. + */ + if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) { + lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, " + "falling back to the host's /proc/stat"); + } + f = fopen("/proc/stat", "r"); if (!f) goto err; //skip first line if (getline(&line, &linelen, f) < 0) { - fprintf(stderr, "proc_stat_read read first line failed\n"); + lxcfs_error("%s\n", "proc_stat_read read first line failed."); goto err; } + if (use_cpuview(cg) && cg_cpu_usage) { + total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, f, d->buf, d->buflen); + goto out; + } + while (getline(&line, &linelen, f) != -1) { ssize_t l; int cpu; char cpu_char[10]; /* That's a lot of cores */ char *c; + uint64_t all_used, cg_used, new_idle; + int ret; if (strlen(line) == 0) continue; @@ -3550,7 +4783,7 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3566,58 +4799,125 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, continue; curcpu ++; - c = strchr(line, ' '); - if (!c) - continue; - l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c); - if (l < 0) { - perror("Error writing to cache"); - rv = 0; - goto err; + ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + &user, + &nice, + &system, + &idle, + &iowait, + &irq, + &softirq, + &steal, + &guest, + &guest_nice); + + if (ret != 10 || !cg_cpu_usage) { + c = strchr(line, ' '); + if (!c) + continue; + l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c); + if (l < 0) { + perror("Error writing to cache"); + rv = 0; + goto err; - } - if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); - rv = 0; - goto err; + } + if (l >= cache_size) { + lxcfs_error("%s\n", "Internal error: truncated write to cache."); + rv = 0; + goto err; + } + + cache += l; + cache_size -= l; + total_len += l; + + if (ret != 10) + continue; } - cache += l; - cache_size -= l; - total_len += l; + if (cg_cpu_usage) { + all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice; + cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system; - if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq, - &softirq, &steal, &guest) != 9) - continue; - user_sum += user; - nice_sum += nice; - system_sum += system; - idle_sum += idle; - iowait_sum += iowait; - irq_sum += irq; - softirq_sum += softirq; - steal_sum += steal; - guest_sum += guest; + if (all_used >= cg_used) { + new_idle = idle + (all_used - cg_used); + + } else { + lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, " + "%lu in cpuacct.usage_all; unable to determine idle time\n", + curcpu, cg, all_used, cg_used); + new_idle = idle; + } + + l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n", + curcpu, cg_cpu_usage[curcpu].user, cg_cpu_usage[curcpu].system, + new_idle); + + if (l < 0) { + perror("Error writing to cache"); + rv = 0; + goto err; + + } + if (l >= cache_size) { + lxcfs_error("%s\n", "Internal error: truncated write to cache."); + rv = 0; + goto err; + } + + cache += l; + cache_size -= l; + total_len += l; + + user_sum += cg_cpu_usage[curcpu].user; + system_sum += cg_cpu_usage[curcpu].system; + idle_sum += new_idle; + + } else { + user_sum += user; + nice_sum += nice; + system_sum += system; + idle_sum += idle; + iowait_sum += iowait; + irq_sum += irq; + softirq_sum += softirq; + steal_sum += steal; + guest_sum += guest; + guest_nice_sum += guest_nice; + } } cache = d->buf; - int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum); - if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){ + int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + user_sum, + nice_sum, + system_sum, + idle_sum, + iowait_sum, + irq_sum, + softirq_sum, + steal_sum, + guest_sum, + guest_nice_sum); + if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) { memcpy(cache, cpuall, cpuall_len); cache += cpuall_len; - } else{ + } else { /* shouldn't happen */ - fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len); + lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len); cpuall_len = 0; } memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len); total_len += cpuall_len; + +out: d->cached = 1; d->size = total_len; - if (total_len > size ) total_len = size; + if (total_len > size) + total_len = size; memcpy(buf, d->buf, total_len); rv = total_len; @@ -3625,33 +4925,21 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, err: if (f) fclose(f); + if (cg_cpu_usage) + free(cg_cpu_usage); free(line); free(cpuset); free(cg); return rv; } -static long int getreaperage(pid_t pid) -{ - char fnam[100]; - struct stat sb; - int ret; - pid_t qpid; - - qpid = lookup_initpid_in_store(pid); - if (qpid <= 0) - return 0; - - ret = snprintf(fnam, 100, "/proc/%d", qpid); - if (ret < 0 || ret >= 100) - return 0; - - if (lstat(fnam, &sb) < 0) - return 0; - - return time(NULL) - sb.st_ctime; -} - +/* This function retrieves the busy time of a group of tasks by looking at + * cpuacct.usage. Unfortunately, this only makes sense when the container has + * been given it's own cpuacct cgroup. If not, this function will take the busy + * time of all other taks that do not actually belong to the container into + * account as well. If someone has a clever solution for this please send a + * patch! + */ static unsigned long get_reaper_busy(pid_t task) { pid_t initpid = lookup_initpid_in_store(task); @@ -3697,33 +4985,37 @@ static int proc_uptime_read(char *buf, size_t size, off_t offset, { struct fuse_context *fc = fuse_get_context(); struct file_info *d = (struct file_info *)fi->fh; - long int reaperage = getreaperage(fc->pid); - unsigned long int busytime = get_reaper_busy(fc->pid), idletime; + unsigned long int busytime = get_reaper_busy(fc->pid); char *cache = d->buf; ssize_t total_len = 0; + uint64_t idletime, reaperage; #if RELOADTEST iwashere(); #endif if (offset){ - if (offset > d->size) - return -EINVAL; if (!d->cached) return 0; + if (offset > d->size) + return -EINVAL; int left = d->size - offset; total_len = left > size ? size: left; memcpy(buf, cache + offset, total_len); return total_len; } - idletime = reaperage - busytime; - if (idletime > reaperage) - idletime = reaperage; + reaperage = get_reaper_age(fc->pid); + /* To understand why this is done, please read the comment to the + * get_reaper_busy() function. + */ + idletime = reaperage; + if (reaperage >= busytime) + idletime = reaperage - busytime; - total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime); - if (total_len < 0){ - perror("Error writing to cache"); + total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime); + if (total_len < 0 || total_len >= d->buflen){ + lxcfs_error("%s\n", "failed to write to cache"); return 0; } @@ -3841,7 +5133,7 @@ static int proc_diskstats_read(char *buf, size_t size, off_t offset, goto err; } if (l >= cache_size) { - fprintf(stderr, "Internal error: truncated write to cache\n"); + lxcfs_error("%s\n", "Internal error: truncated write to cache."); rv = 0; goto err; } @@ -3875,8 +5167,7 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, struct fuse_context *fc = fuse_get_context(); struct file_info *d = (struct file_info *)fi->fh; char *cg = NULL; - char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL, - *memswlimit_default_str = NULL, *memswusage_default_str = NULL; + char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL; unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0; ssize_t total_len = 0, rv = 0; ssize_t l = 0; @@ -3901,32 +5192,19 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, return read_file("/proc/swaps", buf, size, d); prune_init_slice(cg); - if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str)) - goto err; + memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str)) goto err; - memlimit = strtoul(memlimit_str, NULL, 10); memusage = strtoul(memusage_str, NULL, 10); if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) && cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) { - /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */ - if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str)) - goto err; - if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str)) - goto err; - - memswlimit = strtoul(memswlimit_str, NULL, 10); + memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes"); memswusage = strtoul(memswusage_str, NULL, 10); - if (!strcmp(memswlimit_str, memswlimit_default_str)) - memswlimit = 0; - if (!strcmp(memswusage_str, memswusage_default_str)) - memswusage = 0; - swap_total = (memswlimit - memlimit) / 1024; swap_free = (memswusage - memusage) / 1024; } @@ -3980,10 +5258,400 @@ err: free(memlimit_str); free(memusage_str); free(memswusage_str); - free(memswusage_default_str); - free(memswlimit_default_str); return rv; } +/* + * Find the process pid from cgroup path. + * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid. + * @pid_buf : put pid to pid_buf. + * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ... + * @depth : the depth of cgroup in container. + * @sum : return the number of pid. + * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu + */ +static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd) +{ + DIR *dir; + int fd; + struct dirent *file; + FILE *f = NULL; + size_t linelen = 0; + char *line = NULL; + int pd; + char *path_dir, *path; + char **pid; + + /* path = dpath + "/cgroup.procs" + /0 */ + do { + path = malloc(strlen(dpath) + 20); + } while (!path); + + strcpy(path, dpath); + fd = openat(cfd, path, O_RDONLY); + if (fd < 0) + goto out; + + dir = fdopendir(fd); + if (dir == NULL) { + close(fd); + goto out; + } + + while (((file = readdir(dir)) != NULL) && depth > 0) { + if (strncmp(file->d_name, ".", 1) == 0) + continue; + if (strncmp(file->d_name, "..", 1) == 0) + continue; + if (file->d_type == DT_DIR) { + /* path + '/' + d_name +/0 */ + do { + path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name)); + } while (!path_dir); + strcpy(path_dir, path); + strcat(path_dir, "/"); + strcat(path_dir, file->d_name); + pd = depth - 1; + sum = calc_pid(pid_buf, path_dir, pd, sum, cfd); + free(path_dir); + } + } + closedir(dir); + + strcat(path, "/cgroup.procs"); + fd = openat(cfd, path, O_RDONLY); + if (fd < 0) + goto out; + + f = fdopen(fd, "r"); + if (!f) { + close(fd); + goto out; + } + + while (getline(&line, &linelen, f) != -1) { + do { + pid = realloc(*pid_buf, sizeof(char *) * (sum + 1)); + } while (!pid); + *pid_buf = pid; + do { + *(*pid_buf + sum) = malloc(strlen(line) + 1); + } while (*(*pid_buf + sum) == NULL); + strcpy(*(*pid_buf + sum), line); + sum++; + } + fclose(f); +out: + if (line) + free(line); + free(path); + return sum; +} +/* + * calc_load calculates the load according to the following formula: + * load1 = load0 * exp + active * (1 - exp) + * + * @load1: the new loadavg. + * @load0: the former loadavg. + * @active: the total number of running pid at this moment. + * @exp: the fixed-point defined in the beginning. + */ +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + unsigned long newload; + + active = active > 0 ? active * FIXED_1 : 0; + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1 - 1; + + return newload / FIXED_1; +} + +/* + * Return 0 means that container p->cg is closed. + * Return -1 means that error occurred in refresh. + * Positive num equals the total number of pid. + */ +static int refresh_load(struct load_node *p, char *path) +{ + FILE *f = NULL; + char **idbuf; + char proc_path[256]; + int i, ret, run_pid = 0, total_pid = 0, last_pid = 0; + char *line = NULL; + size_t linelen = 0; + int sum, length; + DIR *dp; + struct dirent *file; + + do { + idbuf = malloc(sizeof(char *)); + } while (!idbuf); + sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd); + /* normal exit */ + if (sum == 0) + goto out; + + for (i = 0; i < sum; i++) { + /*clean up '\n' */ + length = strlen(idbuf[i])-1; + idbuf[i][length] = '\0'; + ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]); + if (ret < 0 || ret > 255) { + lxcfs_error("%s\n", "snprintf() failed in refresh_load."); + i = sum; + sum = -1; + goto err_out; + } + + dp = opendir(proc_path); + if (!dp) { + lxcfs_error("%s\n", "Open proc_path failed in refresh_load."); + continue; + } + while ((file = readdir(dp)) != NULL) { + if (strncmp(file->d_name, ".", 1) == 0) + continue; + if (strncmp(file->d_name, "..", 1) == 0) + continue; + total_pid++; + /* We make the biggest pid become last_pid.*/ + ret = atof(file->d_name); + last_pid = (ret > last_pid) ? ret : last_pid; + + ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name); + if (ret < 0 || ret > 255) { + lxcfs_error("%s\n", "snprintf() failed in refresh_load."); + i = sum; + sum = -1; + closedir(dp); + goto err_out; + } + f = fopen(proc_path, "r"); + if (f != NULL) { + while (getline(&line, &linelen, f) != -1) { + /* Find State */ + if ((line[0] == 'S') && (line[1] == 't')) + break; + } + if ((line[7] == 'R') || (line[7] == 'D')) + run_pid++; + fclose(f); + } + } + closedir(dp); + } + /*Calculate the loadavg.*/ + p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid); + p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid); + p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid); + p->run_pid = run_pid; + p->total_pid = total_pid; + p->last_pid = last_pid; + + free(line); +err_out: + for (; i > 0; i--) + free(idbuf[i-1]); +out: + free(idbuf); + return sum; +} +/* + * Traverse the hash table and update it. + */ +void *load_begin(void *arg) +{ + + char *path = NULL; + int i, sum, length, ret; + struct load_node *f; + int first_node; + clock_t time1, time2; + + while (1) { + if (loadavg_stop == 1) + return NULL; + + time1 = clock(); + for (i = 0; i < LOAD_SIZE; i++) { + pthread_mutex_lock(&load_hash[i].lock); + if (load_hash[i].next == NULL) { + pthread_mutex_unlock(&load_hash[i].lock); + continue; + } + f = load_hash[i].next; + first_node = 1; + while (f) { + length = strlen(f->cg) + 2; + do { + /* strlen(f->cg) + '.' or '' + \0 */ + path = malloc(length); + } while (!path); + + ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg); + if (ret < 0 || ret > length - 1) { + /* snprintf failed, ignore the node.*/ + lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg); + goto out; + } + sum = refresh_load(f, path); + if (sum == 0) { + f = del_node(f, i); + } else { +out: f = f->next; + } + free(path); + /* load_hash[i].lock locks only on the first node.*/ + if (first_node == 1) { + first_node = 0; + pthread_mutex_unlock(&load_hash[i].lock); + } + } + } + + if (loadavg_stop == 1) + return NULL; + + time2 = clock(); + usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC)); + } +} + +static int proc_loadavg_read(char *buf, size_t size, off_t offset, + struct fuse_file_info *fi) +{ + struct fuse_context *fc = fuse_get_context(); + struct file_info *d = (struct file_info *)fi->fh; + pid_t initpid; + char *cg; + size_t total_len = 0; + char *cache = d->buf; + struct load_node *n; + int hash; + int cfd, rv = 0; + unsigned long a, b, c; + + if (offset) { + if (offset > d->size) + return -EINVAL; + if (!d->cached) + return 0; + int left = d->size - offset; + total_len = left > size ? size : left; + memcpy(buf, cache + offset, total_len); + return total_len; + } + if (!loadavg) + return read_file("/proc/loadavg", buf, size, d); + + initpid = lookup_initpid_in_store(fc->pid); + if (initpid <= 0) + initpid = fc->pid; + cg = get_pid_cgroup(initpid, "cpu"); + if (!cg) + return read_file("/proc/loadavg", buf, size, d); + + prune_init_slice(cg); + hash = calc_hash(cg) % LOAD_SIZE; + n = locate_node(cg, hash); + + /* First time */ + if (n == NULL) { + if (!find_mounted_controller("cpu", &cfd)) { + /* + * In locate_node() above, pthread_rwlock_unlock() isn't used + * because delete is not allowed before read has ended. + */ + pthread_rwlock_unlock(&load_hash[hash].rdlock); + rv = 0; + goto err; + } + do { + n = malloc(sizeof(struct load_node)); + } while (!n); + + do { + n->cg = malloc(strlen(cg)+1); + } while (!n->cg); + strcpy(n->cg, cg); + n->avenrun[0] = 0; + n->avenrun[1] = 0; + n->avenrun[2] = 0; + n->run_pid = 0; + n->total_pid = 1; + n->last_pid = initpid; + n->cfd = cfd; + insert_node(&n, hash); + } + a = n->avenrun[0] + (FIXED_1/200); + b = n->avenrun[1] + (FIXED_1/200); + c = n->avenrun[2] + (FIXED_1/200); + total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + n->run_pid, n->total_pid, n->last_pid); + pthread_rwlock_unlock(&load_hash[hash].rdlock); + if (total_len < 0 || total_len >= d->buflen) { + lxcfs_error("%s\n", "Failed to write to cache"); + rv = 0; + goto err; + } + d->size = (int)total_len; + d->cached = 1; + + if (total_len > size) + total_len = size; + memcpy(buf, d->buf, total_len); + rv = total_len; + +err: + free(cg); + return rv; +} +/* Return a positive number on success, return 0 on failure.*/ +pthread_t load_daemon(int load_use) +{ + int ret; + pthread_t pid; + + ret = init_load(); + if (ret == -1) { + lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!"); + return 0; + } + ret = pthread_create(&pid, NULL, load_begin, NULL); + if (ret != 0) { + lxcfs_error("%s\n", "Create pthread fails in load_daemon!"); + load_free(); + return 0; + } + /* use loadavg, here loadavg = 1*/ + loadavg = load_use; + return pid; +} + +/* Returns 0 on success. */ +int stop_load_daemon(pthread_t pid) +{ + int s; + + /* Signal the thread to gracefully stop */ + loadavg_stop = 1; + + s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */ + if (s != 0) { + lxcfs_error("%s\n", "stop_load_daemon error: failed to join"); + return -1; + } + + load_free(); + loadavg_stop = 0; + + return 0; +} static off_t get_procfile_size(const char *which) { @@ -4021,7 +5689,8 @@ int proc_getattr(const char *path, struct stat *sb) strcmp(path, "/proc/uptime") == 0 || strcmp(path, "/proc/stat") == 0 || strcmp(path, "/proc/diskstats") == 0 || - strcmp(path, "/proc/swaps") == 0) { + strcmp(path, "/proc/swaps") == 0 || + strcmp(path, "/proc/loadavg") == 0) { sb->st_size = 0; sb->st_mode = S_IFREG | 00444; sb->st_nlink = 1; @@ -4041,7 +5710,8 @@ int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offs filler(buf, "stat", NULL, 0) != 0 || filler(buf, "uptime", NULL, 0) != 0 || filler(buf, "diskstats", NULL, 0) != 0 || - filler(buf, "swaps", NULL, 0) != 0) + filler(buf, "swaps", NULL, 0) != 0 || + filler(buf, "loadavg", NULL, 0) != 0) return -EINVAL; return 0; } @@ -4063,6 +5733,8 @@ int proc_open(const char *path, struct fuse_file_info *fi) type = LXC_TYPE_PROC_DISKSTATS; else if (strcmp(path, "/proc/swaps") == 0) type = LXC_TYPE_PROC_SWAPS; + else if (strcmp(path, "/proc/loadavg") == 0) + type = LXC_TYPE_PROC_LOADAVG; if (type == -1) return -ENOENT; @@ -4120,6 +5792,8 @@ int proc_read(const char *path, char *buf, size_t size, off_t offset, return proc_diskstats_read(buf, size, offset, fi); case LXC_TYPE_PROC_SWAPS: return proc_swaps_read(buf, size, offset, fi); + case LXC_TYPE_PROC_LOADAVG: + return proc_loadavg_read(buf, size, offset, fi); default: return -EINVAL; } @@ -4142,7 +5816,7 @@ static bool mkdir_p(const char *dir, mode_t mode) if (!makeme) return false; if (mkdir(makeme, mode) && errno != EEXIST) { - fprintf(stderr, "failed to create directory '%s': %s", + lxcfs_error("Failed to create directory '%s': %s.\n", makeme, strerror(errno)); free(makeme); return false; @@ -4156,37 +5830,87 @@ static bool mkdir_p(const char *dir, mode_t mode) static bool umount_if_mounted(void) { if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { - fprintf(stderr, "failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); + lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); return false; } return true; } -static int pivot_enter(void) +/* __typeof__ should be safe to use with all compilers. */ +typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic; +static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val) +{ + return (fs->f_type == (fs_type_magic)magic_val); +} + +/* + * looking at fs/proc_namespace.c, it appears we can + * actually expect the rootfs entry to very specifically contain + * " - rootfs rootfs " + * IIUC, so long as we've chrooted so that rootfs is not our root, + * the rootfs entry should always be skipped in mountinfo contents. + */ +static bool is_on_ramfs(void) +{ + FILE *f; + char *p, *p2; + char *line = NULL; + size_t len = 0; + int i; + + f = fopen("/proc/self/mountinfo", "r"); + if (!f) + return false; + + while (getline(&line, &len, f) != -1) { + for (p = line, i = 0; p && i < 4; i++) + p = strchr(p + 1, ' '); + if (!p) + continue; + p2 = strchr(p + 1, ' '); + if (!p2) + continue; + *p2 = '\0'; + if (strcmp(p + 1, "/") == 0) { + // this is '/'. is it the ramfs? + p = strchr(p2 + 1, '-'); + if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) { + free(line); + fclose(f); + return true; + } + } + } + free(line); + fclose(f); + return false; +} + +static int pivot_enter() { int ret = -1, oldroot = -1, newroot = -1; oldroot = open("/", O_DIRECTORY | O_RDONLY); if (oldroot < 0) { - fprintf(stderr, "%s: Failed to open old root for fchdir.\n", __func__); + lxcfs_error("%s\n", "Failed to open old root for fchdir."); return ret; } newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY); if (newroot < 0) { - fprintf(stderr, "%s: Failed to open new root for fchdir.\n", __func__); + lxcfs_error("%s\n", "Failed to open new root for fchdir."); goto err; } /* change into new root fs */ if (fchdir(newroot) < 0) { - fprintf(stderr, "%s: Failed to change directory to new rootfs: %s.\n", __func__, ROOTDIR); + lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR); goto err; } /* pivot_root into our new root fs */ if (pivot_root(".", ".") < 0) { - fprintf(stderr, "%s: pivot_root() syscall failed: %s.\n", __func__, strerror(errno)); + lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno)); goto err; } @@ -4196,16 +5920,17 @@ static int pivot_enter(void) * to the old-root. */ if (fchdir(oldroot) < 0) { - fprintf(stderr, "%s: Failed to enter old root.\n", __func__); + lxcfs_error("%s\n", "Failed to enter old root."); goto err; } + if (umount2(".", MNT_DETACH) < 0) { - fprintf(stderr, "%s: Failed to detach old root.\n", __func__); + lxcfs_error("%s\n", "Failed to detach old root."); goto err; } if (fchdir(newroot) < 0) { - fprintf(stderr, "%s: Failed to re-enter new root.\n", __func__); + lxcfs_error("%s\n", "Failed to re-enter new root."); goto err; } @@ -4216,79 +5941,143 @@ err: close(oldroot); if (newroot > 0) close(newroot); + return ret; } +static int chroot_enter() +{ + if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { + lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); + return -1; + } + + if (chroot(".") < 0) { + lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); + return -1; + } + + if (chdir("/") < 0) { + lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); + return -1; + } + + return 0; +} + +static int permute_and_enter(void) +{ + struct statfs sb; + + if (statfs("/", &sb) < 0) { + lxcfs_error("%s\n", "Could not stat / mountpoint."); + return -1; + } + + /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will + * likely report TMPFS_MAGIC. Hence, when it reports no we still check + * /proc/1/mountinfo. */ + if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs()) + return chroot_enter(); + + if (pivot_enter() < 0) { + lxcfs_error("%s\n", "Could not perform pivot root."); + return -1; + } + + return 0; +} + /* Prepare our new clean root. */ -static int pivot_prepare(void) +static int permute_prepare(void) { if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { - fprintf(stderr, "%s: Failed to create directory for new root.\n", __func__); + lxcfs_error("%s\n", "Failed to create directory for new root."); return -1; } if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { - fprintf(stderr, "%s: Failed to bind-mount / for new root: %s.\n", __func__, strerror(errno)); + lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); return -1; } if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { - fprintf(stderr, "%s: Failed to bind-mount /run into new root: %s.\n", __func__, strerror(errno)); + lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); return -1; } if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { - printf("%s: failed to move " BASEDIR " into new root: %s.\n", __func__, strerror(errno)); + printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); return -1; } return 0; } -static bool pivot_new_root(void) +/* Calls chroot() on ramfs, pivot_root() in all other cases. */ +static bool permute_root(void) { /* Prepare new root. */ - if (pivot_prepare() < 0) + if (permute_prepare() < 0) return false; /* Pivot into new root. */ - if (pivot_enter() < 0) + if (permute_and_enter() < 0) return false; return true; } -static bool setup_cgfs_dir(void) +static int preserve_mnt_ns(int pid) +{ + int ret; + size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt"); + char path[len]; + + ret = snprintf(path, len, "/proc/%d/ns/mnt", pid); + if (ret < 0 || (size_t)ret >= len) + return -1; + + return open(path, O_RDONLY | O_CLOEXEC); +} + +static bool cgfs_prepare_mounts(void) { if (!mkdir_p(BASEDIR, 0700)) { - fprintf(stderr, "Failed to create lxcfs cgroup mountpoint.\n"); + lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); return false; } if (!umount_if_mounted()) { - fprintf(stderr, "Failed to clean up old lxcfs cgroup mountpoint.\n"); + lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); return false; } if (unshare(CLONE_NEWNS) < 0) { - fprintf(stderr, "%s: Failed to unshare mount namespace: %s.\n", __func__, strerror(errno)); + lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); + return false; + } + + cgroup_mount_ns_fd = preserve_mnt_ns(getpid()); + if (cgroup_mount_ns_fd < 0) { + lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); return false; } if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { - fprintf(stderr, "%s: Failed to remount / private: %s.\n", __func__, strerror(errno)); + lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); return false; } if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { - fprintf(stderr, "Failed to mount tmpfs over lxcfs cgroup mountpoint.\n"); + lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); return false; } return true; } -static bool do_mount_cgroups(void) +static bool cgfs_mount_hierarchies(void) { char *target; size_t clen, len; @@ -4296,11 +6085,13 @@ static bool do_mount_cgroups(void) for (i = 0; i < num_hierarchies; i++) { char *controller = hierarchies[i]; + clen = strlen(controller); len = strlen(BASEDIR) + clen + 2; target = malloc(len); if (!target) return false; + ret = snprintf(target, len, "%s/%s", BASEDIR, controller); if (ret < 0 || ret >= len) { free(target); @@ -4310,8 +6101,12 @@ static bool do_mount_cgroups(void) free(target); return false; } - if (mount(controller, target, "cgroup", 0, controller) < 0) { - fprintf(stderr, "Failed mounting cgroup %s\n", controller); + if (!strcmp(controller, "unified")) + ret = mount("none", target, "cgroup2", 0, NULL); + else + ret = mount(controller, target, "cgroup", 0, controller); + if (ret < 0) { + lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno)); free(target); return false; } @@ -4328,50 +6123,41 @@ static bool do_mount_cgroups(void) static bool cgfs_setup_controllers(void) { - if (!setup_cgfs_dir()) + if (!cgfs_prepare_mounts()) return false; - if (!do_mount_cgroups()) { - fprintf(stderr, "Failed to set up private lxcfs cgroup mounts.\n"); + if (!cgfs_mount_hierarchies()) { + lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts."); return false; } - if (!pivot_new_root()) + if (!permute_root()) return false; return true; } -static int preserve_ns(int pid) -{ - int ret; - size_t len = 5 /* /proc */ + 21 /* /int_as_str */ + 7 /* /ns/mnt */ + 1 /* \0 */; - char path[len]; - - ret = snprintf(path, len, "/proc/%d/ns/mnt", pid); - if (ret < 0 || (size_t)ret >= len) - return -1; - - return open(path, O_RDONLY | O_CLOEXEC); -} - static void __attribute__((constructor)) collect_and_mount_subsystems(void) { FILE *f; - char *line = NULL; + char *cret, *line = NULL; + char cwd[MAXPATHLEN]; size_t len = 0; int i, init_ns = -1; + bool found_unified = false; if ((f = fopen("/proc/self/cgroup", "r")) == NULL) { - fprintf(stderr, "Error opening /proc/self/cgroup: %s\n", strerror(errno)); + lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno)); return; } + while (getline(&line, &len, f) != -1) { - char *p, *p2; + char *idx, *p, *p2; p = strchr(line, ':'); if (!p) goto out; + idx = line; *(p++) = '\0'; p2 = strrchr(p, ':'); @@ -4384,32 +6170,54 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void) * because it parses out the empty string "" and later on passes * it to mount(). Let's skip such entries. */ - if (!strcmp(p, "")) - continue; + if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) { + found_unified = true; + p = "unified"; + } if (!store_hierarchy(line, p)) goto out; } /* Preserve initial namespace. */ - init_ns = preserve_ns(getpid()); - if (init_ns < 0) + init_ns = preserve_mnt_ns(getpid()); + if (init_ns < 0) { + lxcfs_error("%s\n", "Failed to preserve initial mount namespace."); goto out; + } - fd_hierarchies = malloc(sizeof(int *) * num_hierarchies); - if (!fd_hierarchies) + fd_hierarchies = malloc(sizeof(int) * num_hierarchies); + if (!fd_hierarchies) { + lxcfs_error("%s\n", strerror(errno)); goto out; + } for (i = 0; i < num_hierarchies; i++) fd_hierarchies[i] = -1; + cret = getcwd(cwd, MAXPATHLEN); + if (!cret) + lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno)); + /* This function calls unshare(CLONE_NEWNS) our initial mount namespace * to privately mount lxcfs cgroups. */ - if (!cgfs_setup_controllers()) + if (!cgfs_setup_controllers()) { + lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs."); goto out; + } + + if (setns(init_ns, 0) < 0) { + lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno)); + goto out; + } - if (setns(init_ns, 0) < 0) + if (!cret || chdir(cwd) < 0) + lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno)); + + if (!init_cpuview()) { + lxcfs_error("%s\n", "failed to init CPU view"); goto out; + } print_subsystems(); @@ -4424,6 +6232,8 @@ static void __attribute__((destructor)) free_subsystems(void) { int i; + lxcfs_debug("%s\n", "Running destructor for liblxcfs."); + for (i = 0; i < num_hierarchies; i++) { if (hierarchies[i]) free(hierarchies[i]); @@ -4432,4 +6242,8 @@ static void __attribute__((destructor)) free_subsystems(void) } free(hierarchies); free(fd_hierarchies); + free_cpuview(); + + if (cgroup_mount_ns_fd >= 0) + close(cgroup_mount_ns_fd); }