/* lxcfs
*
- * Copyright © 2014 Canonical, Inc
+ * Copyright © 2014-2016 Canonical, Inc
* Author: Serge Hallyn <serge.hallyn@ubuntu.com>
*
* See COPYING file for details.
*/
-/*
- * NOTES - make sure to run this as -s to avoid threading.
- * TODO - can we enforce that here from the code?
- */
#define FUSE_USE_VERSION 26
#include <stdio.h>
#include <stdlib.h>
#include <libgen.h>
#include <sched.h>
+#include <pthread.h>
#include <linux/sched.h>
#include <sys/socket.h>
#include <sys/mount.h>
+#include <sys/epoll.h>
#include <wait.h>
-#include <nih/alloc.h>
-#include <nih/string.h>
-
-#include "cgmanager.h"
+#ifdef FORTRAVIS
+#define GLIB_DISABLE_DEPRECATION_WARNINGS
+#include <glib-object.h>
+#endif
-struct lxcfs_state {
- /*
- * a null-terminated, nih-allocated list of the mounted subsystems. We
- * detect this at startup.
- */
- char **subsystems;
-};
-#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
+#include "cgfs.h"
+#include "config.h" // for VERSION
enum {
LXC_TYPE_CGDIR,
char *buf; // unused as of yet
int buflen;
int size; //actual data size
+ int cached;
};
/* reserve buffer size, for cpuall in /proc/stat */
#define BUF_RESERVE_SIZE 256
-static char *must_copy_string(void *parent, const char *str)
+/*
+ * A table caching which pid is init for a pid namespace.
+ * When looking up which pid is init for $qpid, we first
+ * 1. Stat /proc/$qpid/ns/pid.
+ * 2. Check whether the ino_t is in our store.
+ * a. if not, fork a child in qpid's ns to send us
+ * ucred.pid = 1, and read the initpid. Cache
+ * initpid and creation time for /proc/initpid
+ * in a new store entry.
+ * b. if so, verify that /proc/initpid still matches
+ * what we have saved. If not, clear the store
+ * entry and go back to a. If so, return the
+ * cached initpid.
+ */
+struct pidns_init_store { /* one cached pid-namespace -> init-pid mapping */
+ ino_t ino; // inode number for /proc/$pid/ns/pid; HASH(ino) selects the bucket
+ pid_t initpid; // the pid of init in that ns
+ long int ctime; // st_ctime of /proc/$initpid as seen when the entry was saved
+ struct pidns_init_store *next; // next entry in the same hash bucket
+ long int lastcheck; // time(NULL) when the entry was saved or last validated; read by the pruner
+};
+
+/* lol - look at how they are allocated in the kernel */
+#define PIDNS_HASH_SIZE 4096 /* number of buckets in pidns_hash_table */
+#define HASH(x) ((x) % PIDNS_HASH_SIZE) /* bucket index for an inode number */
+
+struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE]; /* bucket heads; entries chain through ->next */
+static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER; /* taken via store_lock()/store_unlock() */
+static void lock_mutex(pthread_mutex_t *l) /* lock @l; does not return on failure */
{
- if (!str)
- return NULL;
- return NIH_MUST( nih_strdup(parent, str) );
+ int ret;
+ /* A non-zero result is reported with strerror(ret) and ends the process via exit(1). */
+ if ((ret = pthread_mutex_lock(l)) != 0) {
+ fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", ret, strerror(ret));
+ exit(1);
+ }
+}
+/* Unlock @l. A non-zero result is reported with strerror(ret) and ends the process via exit(1). */
+static void unlock_mutex(pthread_mutex_t *l)
+{
+ int ret;
+
+ if ((ret = pthread_mutex_unlock(l)) != 0) {
+ fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", ret, strerror(ret));
+ exit(1);
+ }
+}
+/* Take pidns_store_mutex; required by every helper documented as "under store_lock". */
+static void store_lock(void)
+{
+ lock_mutex(&pidns_store_mutex);
+}
+/* Release pidns_store_mutex taken by store_lock(). */
+static void store_unlock(void)
+{
+ unlock_mutex(&pidns_store_mutex);
+}
+
+/*
+ * Decide whether the cached entry @e can still be trusted.
+ * /proc/<e->initpid> must still be stat()able and its st_ctime must equal
+ * the ctime recorded in @e; otherwise the entry is considered stale.
+ * @nsfdsb is accepted but not used by this implementation.
+ * Returns true when the entry is still valid, false otherwise.
+ * Must be called under store_lock
+ */
+static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
+{
+ struct stat initsb;
+ char fnam[100];
+
+ snprintf(fnam, 100, "/proc/%d", e->initpid);
+ if (stat(fnam, &initsb) < 0)
+ return false;
+#if DEBUG
+ fprintf(stderr, "comparing ctime %ld %ld for pid %d\n",
+ e->ctime, initsb.st_ctime, e->initpid);
+#endif
+ if (e->ctime != initsb.st_ctime)
+ return false;
+ return true;
+}
+
+/*
+ * Unlink @e from the bucket selected by HASH(e->ino) and free it.
+ * The bucket head is checked first, then the chain is walked looking for
+ * the predecessor of @e. If @e is not found in that bucket, nothing is
+ * unlinked or freed.
+ * Must be called under store_lock
+ */
+static void remove_initpid(struct pidns_init_store *e)
+{
+ struct pidns_init_store *tmp;
+ int h;
+
+#if DEBUG
+ fprintf(stderr, "remove_initpid: removing entry for %d\n", e->initpid);
+#endif
+ h = HASH(e->ino);
+ if (pidns_hash_table[h] == e) {
+ pidns_hash_table[h] = e->next;
+ free(e);
+ return;
+ }
+
+ tmp = pidns_hash_table[h];
+ while (tmp) {
+ if (tmp->next == e) {
+ tmp->next = e->next;
+ free(e);
+ return;
+ }
+ tmp = tmp->next;
+ }
+}
+
+#define PURGE_SECS 5 /* minimum number of seconds between two pruning passes */
+/*
+ * Drop cache entries that have not been looked up recently.
+ * The first call only records the current time and returns. Later calls
+ * return immediately until at least PURGE_SECS seconds have passed since
+ * the previous pass; a pass then walks every bucket and frees each entry
+ * whose lastcheck is older than now - 2 * PURGE_SECS.
+ * Must be called under store_lock
+ */
+static void prune_initpid_store(void)
+{
+ static long int last_prune = 0;
+ struct pidns_init_store *e, *prev, *delme;
+ long int now, threshold;
+ int i;
+
+ if (!last_prune) {
+ last_prune = time(NULL);
+ return;
+ }
+ now = time(NULL);
+ if (now < last_prune + PURGE_SECS)
+ return;
+#if DEBUG
+ fprintf(stderr, "pruning\n");
+#endif
+ last_prune = now;
+ threshold = now - 2 * PURGE_SECS;
+
+ for (i = 0; i < PIDNS_HASH_SIZE; i++) {
+ for (prev = NULL, e = pidns_hash_table[i]; e; ) {
+ if (e->lastcheck < threshold) {
+#if DEBUG
+ fprintf(stderr, "Removing cached entry for %d\n", e->initpid);
+#endif
+ delme = e;
+ if (prev)
+ prev->next = e->next;
+ else
+ pidns_hash_table[i] = e->next;
+ e = e->next; /* advance before freeing; prev stays where it is */
+ free(delme);
+ } else {
+ prev = e;
+ e = e->next;
+ }
+ }
+ }
+}
+
+/*
+ * Cache @pid as the init pid for the pid namespace described by @sb
+ * (sb->st_ino becomes the entry's key). The st_ctime of /proc/<pid> is
+ * recorded so that a later lookup can detect a stale entry. If
+ * /proc/<pid> cannot be stat()ed, nothing is stored. The allocation is
+ * retried until it succeeds, and the new entry is pushed at the head of
+ * its bucket.
+ * Must be called under store_lock
+ */
+static void save_initpid(struct stat *sb, pid_t pid)
+{
+ struct pidns_init_store *e;
+ char fpath[100];
+ struct stat procsb;
+ int h;
+
+#if DEBUG
+ fprintf(stderr, "save_initpid: adding entry for %d\n", pid);
+#endif
+ snprintf(fpath, 100, "/proc/%d", pid);
+ if (stat(fpath, &procsb) < 0)
+ return;
+ do {
+ e = malloc(sizeof(*e));
+ } while (!e);
+ e->ino = sb->st_ino;
+ e->initpid = pid;
+ e->ctime = procsb.st_ctime;
+ h = HASH(e->ino);
+ e->next = pidns_hash_table[h];
+ e->lastcheck = time(NULL);
+ pidns_hash_table[h] = e;
}
/*
- * TODO - return value should denote whether child exited with failure
- * so callers can return errors. Esp read/write of tasks and cgroup.procs
+ * Given the stat(2) info for a nsfd pid inode, look up the pidns_init_store
+ * entry whose ino equals sb->st_ino. Only the first entry with a matching
+ * inode is considered. If initpid_still_valid() accepts it, its lastcheck
+ * is refreshed and the entry is returned. If not, the entry is removed
+ * (and freed) and NULL is returned. NULL is also returned when no entry
+ * matches.
+ * Must be called under store_lock
*/
+static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
+{
+ int h = HASH(sb->st_ino);
+ struct pidns_init_store *e = pidns_hash_table[h];
+
+ while (e) {
+ if (e->ino == sb->st_ino) {
+ if (initpid_still_valid(e, sb)) {
+ e->lastcheck = time(NULL);
+ return e;
+ }
+ remove_initpid(e);
+ return NULL;
+ }
+ e = e->next;
+ }
+
+ return NULL;
+}
+
+#define SEND_CREDS_OK 0
+#define SEND_CREDS_NOTSK 1
+#define SEND_CREDS_FAIL 2
+static bool recv_creds(int sock, struct ucred *cred, char *v);
+static int wait_for_pid(pid_t pid);
+static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
+
+/*
+ * Never returns; every path ends in _exit().
+ * Opens /proc/<target>/ns/pid, calls setns() on it and forks. The forked
+ * child sends a ucred with uid 0, gid 0, pid 1 and the payload '1' over
+ * the unix socket @sock via send_creds(), so that the receiver can read
+ * the target's reaper's pid as seen in its own namespace.
+ * The forking process only waits for that child and then exits.
+ * Exit status 1 is used for every failure before or during the send.
+ */
+static void write_task_init_pid_exit(int sock, pid_t target)
+{
+ struct ucred cred;
+ char fnam[100];
+ pid_t pid;
+ char v;
+ int fd, ret;
+
+ ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
+ if (ret < 0 || ret >= sizeof(fnam))
+ _exit(1);
+
+ fd = open(fnam, O_RDONLY);
+ if (fd < 0) {
+ perror("write_task_init_pid_exit open of ns/pid");
+ _exit(1);
+ }
+ if (setns(fd, 0)) {
+ perror("write_task_init_pid_exit setns 1");
+ close(fd);
+ _exit(1);
+ }
+ pid = fork();
+ if (pid < 0)
+ _exit(1);
+ if (pid != 0) {
+ if (!wait_for_pid(pid)) /* NOTE(review): the visible wait_for_pid() paths return -1 or 0; confirm this test picks the intended exit status */
+ _exit(1);
+ _exit(0);
+ }
+
+ /* we are the child */
+ cred.uid = 0;
+ cred.gid = 0;
+ cred.pid = 1;
+ v = '1';
+ if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
+ _exit(1);
+ _exit(0);
+}
+/*
+ * Ask a forked helper for the init pid of @task's pid namespace.
+ * A SOCK_DGRAM unix socketpair is created; the child keeps sock[0] and
+ * runs write_task_init_pid_exit(), the parent reads the credentials from
+ * sock[1] with recv_creds() and returns the received cred.pid.
+ * Returns -1 if the socketpair, the fork or the receive fails. Both socket
+ * ends are closed and a successfully forked child is waited for before
+ * returning.
+ */
+static pid_t get_init_pid_for_task(pid_t task)
+{
+ int sock[2];
+ pid_t pid;
+ pid_t ret = -1;
+ char v = '0';
+ struct ucred cred;
+
+ if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
+ perror("socketpair");
+ return -1;
+ }
+
+ pid = fork();
+ if (pid < 0)
+ goto out;
+ if (!pid) {
+ close(sock[1]);
+ write_task_init_pid_exit(sock[0], task);
+ _exit(0);
+ }
+
+ if (!recv_creds(sock[1], &cred, &v))
+ goto out;
+ ret = cred.pid;
+
+out:
+ close(sock[0]);
+ close(sock[1]);
+ if (pid > 0)
+ wait_for_pid(pid);
+ return ret;
+}
+/*
+ * Return the init pid of the pid namespace that @qpid lives in, using the
+ * cache when possible. The whole lookup, including a possible call to
+ * get_init_pid_for_task(), runs with the store lock held.
+ * Returns 0 if /proc/<qpid>/ns/pid cannot be stat()ed, the cached pid on a
+ * valid cache hit, and otherwise whatever get_init_pid_for_task() returned
+ * (-1 on failure). Only results > 0 are cached.
+ */
+static pid_t lookup_initpid_in_store(pid_t qpid)
+{
+ pid_t answer = 0;
+ struct stat sb;
+ struct pidns_init_store *e;
+ char fnam[100];
+
+ snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
+ store_lock();
+ if (stat(fnam, &sb) < 0)
+ goto out;
+ e = lookup_verify_initpid(&sb);
+ if (e) {
+ answer = e->initpid;
+ goto out;
+ }
+ answer = get_init_pid_for_task(qpid);
+ if (answer > 0)
+ save_initpid(&sb, answer);
+
+out:
+ /* we prune at the end, after answer has been copied out of the entry,
+ * in case pruning frees the entry we were about to return from */
+ prune_initpid_store();
+ store_unlock();
+ return answer;
+}
+
static int wait_for_pid(pid_t pid)
{
int status, ret;
+ if (pid <= 0)
+ return -1;
+
again:
ret = waitpid(pid, &status, 0);
if (ret == -1) {
return 0;
}
+
+/*
+ * append pid, formatted as "%d\n", to *src.
+ * src: a pointer to a char* to which to append the pid; may point to NULL.
+ * sz: the number of characters printed so far, minus trailing \0.
+ * asz: the allocated size so far
+ * pid: the pid to append
+ * The buffer is grown by BUF_RESERVE_SIZE when *src is NULL or the new text
+ * plus terminator would not fit; realloc is retried until it succeeds.
+ * *sz and *asz are updated and the result is always NUL-terminated.
+ */
+static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
+{
+ char tmp[30];
+
+ int tmplen = sprintf(tmp, "%d\n", (int)pid);
+
+ if (!*src || tmplen + *sz + 1 >= *asz) {
+ char *tmp; /* shadows the outer tmp[] inside this block only */
+ do {
+ tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
+ } while (!tmp);
+ *src = tmp;
+ *asz += BUF_RESERVE_SIZE;
+ }
+ memcpy((*src) +*sz , tmp, tmplen); /* here tmp is the outer formatted buffer again */
+ *sz += tmplen;
+ (*src)[*sz] = '\0';
+}
+
/*
* Given a open file * to /proc/pid/{u,g}id_map, and an id
* valid in the caller's namespace, return the id mapped into
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false
+#define PROCLEN 100
+
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
- nih_local char *fpath = NULL;
+ char fpath[PROCLEN];
+ int ret;
bool answer = false;
uid_t nsuid;
if (!req_ns_root && uid == victim)
return true;
- fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
+ ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
+ if (ret < 0 || ret >= PROCLEN)
+ return false;
FILE *f = fopen(fpath, "r");
if (!f)
return false;
return ((fmode & r) == r);
}
+
+/*
+ * taskcg is a/b/c
+ * querycg is /a/b/c/d/e
+ * we return 'd'
+ */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
char *start, *end;
}
if (strcmp(querycg, "/") == 0)
- start = NIH_MUST( nih_strdup(NULL, taskcg + 1) );
+ start = strdup(taskcg + 1);
else
- start = NIH_MUST( nih_strdup(NULL, taskcg + strlen(querycg) + 1) );
+ start = strdup(taskcg + strlen(querycg) + 1);
+ if (!start)
+ return NULL;
end = strchr(start, '/');
if (end)
*end = '\0';
return start;
}
+static void stripnewline(char *x) /* remove one trailing '\n' from x, in place */
+{
+ size_t l = strlen(x);
+ if (l && x[l-1] == '\n') /* empty strings are left alone */
+ x[l-1] = '\0';
+}
+/*
+ * Return the cgroup of @pid for controller @contrl, as read from
+ * /proc/<pid>/cgroup. Each line is split at its first two ':'; the line
+ * whose middle field equals the string returned by
+ * find_mounted_controller(contrl) is selected, and a copy of its last field
+ * without the trailing newline is returned. The caller must free() it.
+ * Returns NULL if find_mounted_controller() returns NULL, the path does not
+ * fit, the file cannot be opened, a line lacks either ':' (scanning stops
+ * there), or no line matches.
+ */
+static char *get_pid_cgroup(pid_t pid, const char *contrl)
+{
+ char fnam[PROCLEN];
+ FILE *f;
+ char *answer = NULL;
+ char *line = NULL;
+ size_t len = 0;
+ int ret;
+ const char *h = find_mounted_controller(contrl);
+ if (!h)
+ return NULL;
+
+ ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
+ if (ret < 0 || ret >= PROCLEN)
+ return NULL;
+ if (!(f = fopen(fnam, "r")))
+ return NULL;
+
+ while (getline(&line, &len, f) != -1) {
+ char *c1, *c2;
+ if (!line[0])
+ continue;
+ c1 = strchr(line, ':');
+ if (!c1)
+ goto out;
+ c1++;
+ c2 = strchr(c1, ':');
+ if (!c2)
+ goto out;
+ *c2 = '\0';
+ if (strcmp(c1, h) != 0)
+ continue;
+ c2++;
+ stripnewline(c2);
+ do {
+ answer = strdup(c2);
+ } while (!answer);
+ break;
+ }
+
+out:
+ fclose(f);
+ free(line);
+ return answer;
+}
+
/*
* check whether a fuse context may access a cgroup dir or file
*
*/
static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
{
- nih_local struct cgm_keys **list = NULL;
- int i;
-
- if (!file)
- file = "tasks";
+ struct cgfs_files *k = NULL;
+ bool ret = false;
- if (*file == '/')
- file++;
-
- if (!cgm_list_keys(contrl, cg, &list))
+ k = cgfs_get_key(contrl, cg, file); /* supplies the uid, gid and mode checked below; released at out: */
+ if (!k)
return false;
- for (i = 0; list[i]; i++) {
- if (strcmp(list[i]->name, file) == 0) {
- struct cgm_keys *k = list[i];
- if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
- if (perms_include(k->mode >> 6, mode))
- return true;
- }
- if (fc->gid == k->gid) {
- if (perms_include(k->mode >> 3, mode))
- return true;
- }
- return perms_include(k->mode, mode);
+ /*
+ * Three checks against k->mode, tried in order: the bits at >> 6 when
+ * the caller is privileged over k->uid, the bits at >> 3 when the
+ * caller's gid equals k->gid, and finally the unshifted bits. A failed
+ * earlier check falls through to the next one.
+ */
+ if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
+ if (perms_include(k->mode >> 6, mode)) {
+ ret = true;
+ goto out;
}
}
+ if (fc->gid == k->gid) {
+ if (perms_include(k->mode >> 3, mode)) {
+ ret = true;
+ goto out;
+ }
+ }
+ ret = perms_include(k->mode, mode);
- return false;
+out:
+ free_key(k);
+ return ret;
}
-static void stripnewline(char *x)
+#define INITSCOPE "/init.scope"
+static void prune_init_slice(char *cg) /* strip a trailing INITSCOPE component from cg, in place */
{
- size_t l = strlen(x);
- if (l && x[l-1] == '\n')
- x[l-1] = '\0';
+ char *point;
+ size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
+ /* Too short to end in INITSCOPE: leave cg untouched. */
+ if (cg_len < initscope_len)
+ return;
+
+ point = cg + cg_len - initscope_len;
+ if (strcmp(point, INITSCOPE) == 0) {
+ if (point == cg)
+ *(point+1) = '\0'; /* cg is exactly INITSCOPE: keep the leading '/' so the result is "/" */
+ else
+ *point = '\0';
+ }
}
/*
- * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
- * If caller is in /a, he may act on /a/b, but not on /b.
+ * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
+ * If pid is in /a, he may act on /a/b, but not on /b.
+ * pid's cgroup is read with get_pid_cgroup() and a trailing "/init.scope"
+ * is stripped before comparing. If that lookup fails, false is returned
+ * and *nextcg is not written.
* if the answer is false and nextcg is not NULL, then *nextcg will point
- * to a nih_alloc'd string containing the next cgroup directory under cg
+ * to a string containing the next cgroup directory under cg, which must be
+ * freed by the caller.
*/
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
- nih_local char *fnam = NULL;
- FILE *f;
bool answer = false;
- char *line = NULL;
- size_t len = 0;
+ char *c2 = get_pid_cgroup(pid, contrl);
+ char *linecmp;
- fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
- if (!(f = fopen(fnam, "r")))
+ if (!c2)
return false;
+ prune_init_slice(c2);
- while (getline(&line, &len, f) != -1) {
- char *c1, *c2, *linecmp;
- if (!line[0])
- continue;
- c1 = strchr(line, ':');
- if (!c1)
- goto out;
- c1++;
- c2 = strchr(c1, ':');
- if (!c2)
- goto out;
- *c2 = '\0';
- if (strcmp(c1, contrl) != 0)
- continue;
- c2++;
- stripnewline(c2);
- /*
- * callers pass in '/' for root cgroup, otherwise they pass
- * in a cgroup without leading '/'
- */
- linecmp = *cg == '/' ? c2 : c2+1;
- if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
- if (nextcg)
- *nextcg = get_next_cgroup_dir(linecmp, cg);
- goto out;
+ /*
+ * callers pass in '/' for root cgroup, otherwise they pass
+ * in a cgroup without leading '/'
+ * NOTE(review): the test below is a plain string-prefix comparison
+ * of pid's cgroup against cg, with no check for a '/' boundary.
+ */
+ linecmp = *cg == '/' ? c2 : c2+1;
+ if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
+ if (nextcg) {
+ *nextcg = get_next_cgroup_dir(linecmp, cg);
}
+ goto out;
+ }
+ answer = true;
+
+out:
+ free(c2);
+ return answer;
+}
+
+/*
+ * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
+ * The root ("/") is always visible. Otherwise @cg (passed without a
+ * leading '/') is visible when it equals pid's cgroup, is one of its
+ * parent directories, or lies below it. pid's cgroup comes from
+ * get_pid_cgroup() with a trailing "/init.scope" stripped; if that lookup
+ * fails the answer is false.
+ */
+static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
+{
+ bool answer = false;
+ char *c2, *task_cg;
+ size_t target_len, task_len;
+
+ if (strcmp(cg, "/") == 0)
+ return true;
+
+ c2 = get_pid_cgroup(pid, contrl);
+ if (!c2)
+ return false;
+ prune_init_slice(c2);
+
+ task_cg = c2 + 1;
+ target_len = strlen(cg);
+ task_len = strlen(task_cg);
+ if (task_len == 0) {
+ /* Task is in the root cg, it can see everything. This case is
+ * not handled by the strcmps below, since they test for the
+ * last /, but that is the first / that we've chopped off
+ * above.
+ */
+ answer = true;
+ goto out;
+ }
+ if (strcmp(cg, task_cg) == 0) {
answer = true;
goto out;
}
+ if (target_len < task_len) {
+ /* looking up a parent dir */
+ if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
+ answer = true;
+ goto out;
+ }
+ if (target_len > task_len) {
+ /* looking up a child dir */
+ if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
+ answer = true;
+ goto out;
+ }
out:
- fclose(f);
- free(line);
+ free(c2);
return answer;
}
/*
- * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
- * and needs to be nih_freed.
+ * given /cgroup/freezer/a/b, return "freezer".
+ * Returns NULL when path is shorter than 9 characters, when path[7] is not
+ * '/', or when the extracted name matches no entry of hierarchies[].
+ * the returned char* is the matching hierarchies[] element and should NOT
+ * be freed.
*/
static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
{
const char *p1;
- char *ret, *slash;
+ char *contr, *slash;
if (strlen(path) < 9)
return NULL;
+ if (*(path+7) != '/')
+ return NULL;
p1 = path+8;
- ret = nih_strdup(NULL, p1);
- if (!ret)
- return ret;
- slash = strstr(ret, "/");
+ contr = strdupa(p1); /* scratch copy so the name can be cut at the next '/' */
+ if (!contr)
+ return NULL;
+ slash = strstr(contr, "/");
if (slash)
*slash = '\0';
- /* verify that it is a subsystem */
- char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
int i;
- if (!list) {
- nih_free(ret);
- return NULL;
+ for (i = 0; i < num_hierarchies; i++) {
+ if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
+ return hierarchies[i];
}
- for (i = 0; list[i]; i++) {
- if (strcmp(list[i], ret) == 0)
- return ret;
- }
- nih_free(ret);
return NULL;
}
return p1+1;
}
-static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
-{
- nih_local char **list = NULL;
- int i;
-
- if (!f)
- return false;
- if (*f == '/')
- f++;
-
- if (!cgm_list_children(contr, dir, &list))
- return false;
- for (i = 0; list[i]; i++) {
- if (strcmp(list[i], f) == 0)
- return true;
- }
-
- return false;
-}
-
-static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
-{
- nih_local struct cgm_keys **list = NULL;
- struct cgm_keys *k;
- int i;
-
- if (!f)
- return NULL;
- if (*f == '/')
- f++;
- if (!cgm_list_keys(contr, dir, &list))
- return NULL;
- for (i = 0; list[i]; i++) {
- if (strcmp(list[i]->name, f) == 0) {
- k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
- k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
- k->uid = list[i]->uid;
- k->gid = list[i]->gid;
- k->mode = list[i]->mode;
- return k;
- }
- }
-
- return NULL;
-}
-
-static void get_cgdir_and_path(const char *cg, char **dir, char **file)
+/*
+ * split the last path element from the path in @cg.
+ * @dir is newly allocated (strdup retried until it succeeds) and should be
+ * freed by the caller. @last must not be freed: it points into @cg at its
+ * final '/', or is NULL when @cg contains no '/', in which case @dir is a
+ * full copy of @cg.
+*/
+static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
char *p;
- *dir = NIH_MUST( nih_strdup(NULL, cg) );
- *file = strrchr(cg, '/');
- if (!*file) {
- *file = NULL;
+ do {
+ *dir = strdup(cg);
+ } while (!*dir);
+ *last = strrchr(cg, '/');
+ if (!*last) {
+ *last = NULL;
return;
}
p = strrchr(*dir, '/');
*p = '\0';
}
-static size_t get_file_size(const char *contrl, const char *cg, const char *f)
-{
- nih_local char *data = NULL;
- size_t s;
- if (!cgm_get_value(contrl, cg, f, &data))
- return -EINVAL;
- s = strlen(data);
- return s;
-}
-
/*
* FUSE ops for /cgroup
*/
{
struct timespec now;
struct fuse_context *fc = fuse_get_context();
- nih_local char * cgdir = NULL;
- char *fpath = NULL, *path1, *path2;
- nih_local struct cgm_keys *k = NULL;
+ char * cgdir = NULL;
+ char *last = NULL, *path1, *path2;
+ struct cgfs_files *k = NULL;
const char *cgroup;
- nih_local char *controller = NULL;
+ const char *controller = NULL;
+ int ret = -ENOENT;
if (!fc)
return 0;
}
- get_cgdir_and_path(cgroup, &cgdir, &fpath);
+ get_cgdir_and_path(cgroup, &cgdir, &last);
- if (!fpath) {
+ if (!last) {
path1 = "/";
path2 = cgdir;
} else {
path1 = cgdir;
- path2 = fpath;
+ path2 = last;
}
+ pid_t initpid = lookup_initpid_in_store(fc->pid);
+ if (initpid <= 0)
+ initpid = fc->pid;
/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
- * Then check that caller's cgroup is under path if fpath is a child
- * cgroup, or cgdir if fpath is a file */
+ * Then check that caller's cgroup is under path if last is a child
+ * cgroup, or cgdir if last is a file */
if (is_child_cgroup(controller, path1, path2)) {
- if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
+ if (!caller_may_see_dir(initpid, controller, cgroup)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
/* this is just /cgroup/controller, return it as a dir */
sb->st_mode = S_IFDIR | 00555;
sb->st_nlink = 2;
- return 0;
+ ret = 0;
+ goto out;
+ }
+ if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
+ ret = -EACCES;
+ goto out;
}
- if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
- return -EACCES;
// get uid, gid, from '/tasks' file and make up a mode
// That is a hack, until cgmanager gains a GetCgroupPerms fn.
sb->st_mode = S_IFDIR | 00755;
- k = get_cgroup_key(controller, cgroup, "tasks");
+ k = cgfs_get_key(controller, cgroup, NULL);
if (!k) {
sb->st_uid = sb->st_gid = 0;
} else {
sb->st_uid = k->uid;
sb->st_gid = k->gid;
}
+ free_key(k);
sb->st_nlink = 2;
- return 0;
+ ret = 0;
+ goto out;
}
- if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
- if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
- return -ENOENT;
- if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
- return -EACCES;
-
+ if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
sb->st_mode = S_IFREG | k->mode;
sb->st_nlink = 1;
sb->st_uid = k->uid;
sb->st_gid = k->gid;
- sb->st_size = get_file_size(controller, path1, path2);
- return 0;
+ sb->st_size = 0;
+ free_key(k);
+ if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
+ ret = -EACCES;
+ goto out;
+ }
+
+ ret = 0;
}
- return -ENOENT;
+out:
+ free(cgdir);
+ return ret;
}
-/*
- * TODO - cache these results in a table for use in opendir, free
- * in releasedir
- */
static int cg_opendir(const char *path, struct fuse_file_info *fi)
{
struct fuse_context *fc = fuse_get_context();
- nih_local struct cgm_keys **list = NULL;
const char *cgroup;
struct file_info *dir_info;
- nih_local char *controller = NULL;
+ char *controller = NULL;
if (!fc)
return -EIO;
}
}
- if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
- return -EACCES;
+ pid_t initpid = lookup_initpid_in_store(fc->pid);
+ if (initpid <= 0)
+ initpid = fc->pid;
+ if (cgroup) {
+ if (!caller_may_see_dir(initpid, controller, cgroup))
+ return -ENOENT;
+ if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
+ return -EACCES;
+ }
/* we'll free this at cg_releasedir */
- dir_info = NIH_MUST( nih_alloc(NULL, sizeof(*dir_info)) );
- dir_info->controller = must_copy_string(dir_info, controller);
- dir_info->cgroup = must_copy_string(dir_info, cgroup);
+ dir_info = malloc(sizeof(*dir_info));
+ if (!dir_info)
+ return -ENOMEM;
+ dir_info->controller = must_copy_string(controller);
+ dir_info->cgroup = must_copy_string(cgroup);
dir_info->type = LXC_TYPE_CGDIR;
dir_info->buf = NULL;
dir_info->file = NULL;
struct fuse_file_info *fi)
{
struct file_info *d = (struct file_info *)fi->fh;
- nih_local struct cgm_keys **list = NULL;
- int i;
- nih_local char *nextcg = NULL;
+ struct cgfs_files **list = NULL;
+ int i, ret;
+ char *nextcg = NULL;
struct fuse_context *fc = fuse_get_context();
+ char **clist = NULL;
if (d->type != LXC_TYPE_CGDIR) {
fprintf(stderr, "Internal error: file cache info used in readdir\n");
}
if (!d->cgroup && !d->controller) {
// ls /var/lib/lxcfs/cgroup - just show list of controllers
- char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
int i;
- if (!list)
- return -EIO;
-
- for (i = 0; list[i]; i++) {
- if (filler(buf, list[i], NULL, 0) != 0) {
+ for (i = 0; i < num_hierarchies; i++) {
+ if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
return -EIO;
}
}
return 0;
}
- if (!cgm_list_keys(d->controller, d->cgroup, &list))
+ if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
// not a valid cgroup
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
+ }
- if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
+ pid_t initpid = lookup_initpid_in_store(fc->pid);
+ if (initpid <= 0)
+ initpid = fc->pid;
+ if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
if (nextcg) {
int ret;
ret = filler(buf, nextcg, NULL, 0);
- if (ret != 0)
- return -EIO;
+ free(nextcg);
+ if (ret != 0) {
+ ret = -EIO;
+ goto out;
+ }
}
- return 0;
+ ret = 0;
+ goto out;
}
for (i = 0; list[i]; i++) {
if (filler(buf, list[i]->name, NULL, 0) != 0) {
- return -EIO;
+ ret = -EIO;
+ goto out;
}
}
// now get the list of child cgroups
- nih_local char **clist = NULL;
- if (!cgm_list_children(d->controller, d->cgroup, &clist))
- return 0;
+ if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
+ ret = 0;
+ goto out;
+ }
for (i = 0; clist[i]; i++) {
if (filler(buf, clist[i], NULL, 0) != 0) {
- return -EIO;
+ ret = -EIO;
+ goto out;
}
}
- return 0;
+ ret = 0;
+
+out:
+ free_keys(list);
+ if (clist) {
+ for (i = 0; clist[i]; i++)
+ free(clist[i]);
+ free(clist);
+ }
+ return ret;
}
static void do_release_file_info(struct file_info *f)
{
- /*
- * all file_info fields which are nih_alloc()d with f as parent
- * will be automatically freed
- */
- nih_free(f);
+ if (!f) /* nothing to release */
+ return;
+ free(f->controller); /* each owned string and the buffer are freed individually ... */
+ free(f->cgroup);
+ free(f->file);
+ free(f->buf);
+ free(f); /* ... before the structure itself */
}
static int cg_releasedir(const char *path, struct fuse_file_info *fi)
static int cg_open(const char *path, struct fuse_file_info *fi)
{
- nih_local char *controller = NULL;
const char *cgroup;
- char *fpath = NULL, *path1, *path2;
- nih_local char * cgdir = NULL;
- nih_local struct cgm_keys *k = NULL;
+ char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
+ struct cgfs_files *k = NULL;
struct file_info *file_info;
struct fuse_context *fc = fuse_get_context();
+ int ret;
if (!fc)
return -EIO;
if (!cgroup)
return -EINVAL;
- get_cgdir_and_path(cgroup, &cgdir, &fpath);
- if (!fpath) {
+ get_cgdir_and_path(cgroup, &cgdir, &last);
+ if (!last) {
path1 = "/";
path2 = cgdir;
} else {
path1 = cgdir;
- path2 = fpath;
+ path2 = last;
}
- k = get_cgroup_key(controller, path1, path2);
- if (!k)
- return -EINVAL;
+ k = cgfs_get_key(controller, path1, path2);
+ if (!k) {
+ ret = -EINVAL;
+ goto out;
+ }
+ free_key(k);
- if (!fc_may_access(fc, controller, path1, path2, fi->flags))
+ pid_t initpid = lookup_initpid_in_store(fc->pid);
+ if (initpid <= 0)
+ initpid = fc->pid;
+ if (!caller_may_see_dir(initpid, controller, path1)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
// should never get here
- return -EACCES;
+ ret = -EACCES;
+ goto out;
+ }
/* we'll free this at cg_release */
- file_info = NIH_MUST( nih_alloc(NULL, sizeof(*file_info)) );
- file_info->controller = must_copy_string(file_info, controller);
- file_info->cgroup = must_copy_string(file_info, path1);
- file_info->file = must_copy_string(file_info, path2);
+ file_info = malloc(sizeof(*file_info));
+ if (!file_info) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ file_info->controller = must_copy_string(controller);
+ file_info->cgroup = must_copy_string(path1);
+ file_info->file = must_copy_string(path2);
file_info->type = LXC_TYPE_CGFILE;
file_info->buf = NULL;
file_info->buflen = 0;
fi->fh = (unsigned long)file_info;
- return 0;
+ ret = 0;
+
+out:
+ free(cgdir);
+ return ret;
}
static int cg_release(const char *path, struct fuse_file_info *fi)
return 0;
}
-static int msgrecv(int sockfd, void *buf, size_t len)
+#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
+/*
+ * Wait until @sock reports one of the POLLIN_SET events, or until
+ * @timeout seconds (measured with time()) have passed since entry.
+ * A temporary epoll instance is created for the wait and closed on every
+ * path. An epoll_wait() interrupted by EINTR is restarted with the
+ * remaining time.
+ * Returns true if an event was reported. Returns false on setup failure,
+ * on timeout, or on an epoll_wait() error; when the deadline is found to
+ * have passed before waiting, errno is set to 0, otherwise errno is the
+ * value left by epoll_wait().
+ */
+static bool wait_for_sock(int sock, int timeout)
{
- struct timeval tv;
- fd_set rfds;
+ struct epoll_event ev;
+ int epfd, ret, now, starttime, deltatime, saved_errno;
+
+ if ((starttime = time(NULL)) < 0)
+ return false;
+
+ if ((epfd = epoll_create(1)) < 0) {
+ fprintf(stderr, "Failed to create epoll socket: %m\n");
+ return false;
+ }
- FD_ZERO(&rfds);
- FD_SET(sockfd, &rfds);
- tv.tv_sec = 2;
- tv.tv_usec = 0;
+ ev.events = POLLIN_SET;
+ ev.data.fd = sock;
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
+ fprintf(stderr, "Failed adding socket to epoll: %m\n");
+ close(epfd);
+ return false;
+ }
+
+again:
+ if ((now = time(NULL)) < 0) {
+ close(epfd);
+ return false;
+ }
+
+ deltatime = (starttime + timeout) - now;
+ if (deltatime < 0) { // timeout
+ errno = 0;
+ close(epfd);
+ return false;
+ }
+ ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1); // remaining seconds converted to milliseconds, plus one
+ if (ret < 0 && errno == EINTR)
+ goto again;
+ saved_errno = errno; // close() below may overwrite errno
+ close(epfd);
+
+ if (ret <= 0) {
+ errno = saved_errno;
+ return false;
+ }
+ return true;
+}
- if (select(sockfd+1, &rfds, NULL, NULL, &tv) <= 0)
+static int msgrecv(int sockfd, void *buf, size_t len) /* recv() with a 2 second wait for readability */
+{
+ if (!wait_for_sock(sockfd, 2)) /* nothing to read within the timeout, or the wait failed */
return -1;
return recv(sockfd, buf, len, MSG_DONTWAIT);
}
-#define SEND_CREDS_OK 0
-#define SEND_CREDS_NOTSK 1
-#define SEND_CREDS_FAIL 2
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
{
struct msghdr msg = { 0 };
char buf[1];
int ret;
int optval = 1;
- struct timeval tv;
- fd_set rfds;
*v = '1';
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
- FD_ZERO(&rfds);
- FD_SET(sock, &rfds);
- tv.tv_sec = 2;
- tv.tv_usec = 0;
- if (select(sock+1, &rfds, NULL, NULL, &tv) <= 0) {
- fprintf(stderr, "Failed to select for scm_cred: %s\n",
+ if (!wait_for_sock(sock, 2)) {
+ fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
strerror(errno));
return false;
}
while (recv_creds(sock, &cred, &v)) {
if (v == '1')
- exit(0);
+ _exit(0);
if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
- exit(1);
+ _exit(1);
}
- exit(0);
+ _exit(0);
}
/*
int newnsfd = -1, ret, cpipe[2];
char fnam[100];
pid_t cpid;
- struct timeval tv;
- fd_set s;
char v;
- sprintf(fnam, "/proc/%d/ns/pid", tpid);
+ ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
+ if (ret < 0 || ret >= sizeof(fnam))
+ _exit(1);
newnsfd = open(fnam, O_RDONLY);
if (newnsfd < 0)
- exit(1);
+ _exit(1);
if (setns(newnsfd, 0) < 0)
- exit(1);
+ _exit(1);
close(newnsfd);
if (pipe(cpipe) < 0)
- exit(1);
+ _exit(1);
-loop:
cpid = fork();
if (cpid < 0)
- exit(1);
+ _exit(1);
if (!cpid) {
char b = '1';
}
close(cpipe[1]);
pid_to_ns(sock, tpid);
+ _exit(1); // not reached
}
// give the child 1 second to be done forking and
- // write it's ack
- FD_ZERO(&s);
- FD_SET(cpipe[0], &s);
- tv.tv_sec = 1;
- tv.tv_usec = 0;
- ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
- if (ret <= 0)
- goto again;
+ // write its ack
+ if (!wait_for_sock(cpipe[0], 1))
+ _exit(1);
ret = read(cpipe[0], &v, 1);
- if (ret != sizeof(char) || v != '1') {
- goto again;
- }
+ if (ret != sizeof(char) || v != '1')
+ _exit(1);
if (!wait_for_pid(cpid))
- exit(1);
- exit(0);
-
-again:
- kill(cpid, SIGKILL);
- wait_for_pid(cpid);
- goto loop;
+ _exit(1);
+ _exit(0);
}
/*
* To read cgroup files with a particular pid, we will setns into the child
* pidns, open a pipe, fork a child - which will be the first to really be in
- * the child ns - which does the cgm_get_value and writes the data to the pipe.
+ * the child ns - which does the cgfs_get_value and writes the data to the pipe.
*/
static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
int sock[2] = {-1, -1};
- nih_local char *tmpdata = NULL;
+ char *tmpdata = NULL;
int ret;
pid_t qpid, cpid = -1;
bool answer = false;
char v = '0';
struct ucred cred;
- struct timeval tv;
- fd_set s;
+ size_t sz = 0, asz = 0;
- if (!cgm_get_value(contrl, cg, file, &tmpdata))
+ if (!cgfs_get_value(contrl, cg, file, &tmpdata))
return false;
/*
if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
perror("socketpair");
- exit(1);
+ free(tmpdata);
+ return false;
}
cpid = fork();
if (cpid == -1)
goto out;
- if (!cpid) // child
+ if (!cpid) // child - exits when done
pid_to_ns_wrapper(sock[1], tpid);
char *ptr = tmpdata;
goto out;
// read converted results
- FD_ZERO(&s);
- FD_SET(sock[0], &s);
- tv.tv_sec = 2;
- tv.tv_usec = 0;
- ret = select(sock[0]+1, &s, NULL, NULL, &tv);
- if (ret <= 0) {
- fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
+ if (!wait_for_sock(sock[0], 2)) {
+ fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
__func__, strerror(errno));
goto out;
}
__func__, strerror(errno));
goto out;
}
- NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
+ must_strcat_pid(d, &sz, &asz, qpid);
next:
ptr = strchr(ptr, '\n');
if (!ptr)
answer = true;
out:
+ free(tmpdata);
if (cpid != -1)
wait_for_pid(cpid);
if (sock[0] != -1) {
{
struct fuse_context *fc = fuse_get_context();
struct file_info *f = (struct file_info *)fi->fh;
- nih_local struct cgm_keys *k = NULL;
+ struct cgfs_files *k = NULL;
+ char *data = NULL;
+ int ret, s;
+ bool r;
if (f->type != LXC_TYPE_CGFILE) {
fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
}
if (offset)
- return -EIO;
+ return 0;
if (!fc)
return -EIO;
if (!f->controller)
return -EINVAL;
- if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
- nih_local char *data = NULL;
- int s;
- bool r;
+ if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
+ return -EINVAL;
+ }
+ free_key(k);
- if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
- // should never get here
- return -EACCES;
- if (strcmp(f->file, "tasks") == 0 ||
- strcmp(f->file, "/tasks") == 0 ||
- strcmp(f->file, "/cgroup.procs") == 0 ||
- strcmp(f->file, "cgroup.procs") == 0)
- // special case - we have to translate the pids
- r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
- else
- r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
+ if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here
+ ret = -EACCES;
+ goto out;
+ }
- if (!r)
- return -EINVAL;
+ if (strcmp(f->file, "tasks") == 0 ||
+ strcmp(f->file, "/tasks") == 0 ||
+ strcmp(f->file, "/cgroup.procs") == 0 ||
+ strcmp(f->file, "cgroup.procs") == 0)
+ // special case - we have to translate the pids
+ r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
+ else
+ r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
- if (!data)
- return 0;
- s = strlen(data);
- if (s > size)
- s = size;
- memcpy(buf, data, s);
+ if (!r) {
+ ret = -EINVAL;
+ goto out;
+ }
- return s;
+ if (!data) {
+ ret = 0;
+ goto out;
}
+ s = strlen(data);
+ if (s > size)
+ s = size;
+ memcpy(buf, data, s);
+ if (s > 0 && s < size && data[s-1] != '\n')
+ buf[s++] = '\n';
- return -EINVAL;
+ ret = s;
+
+out:
+ free(data);
+ return ret;
}
static void pid_from_ns(int sock, pid_t tpid)
pid_t vpid;
struct ucred cred;
char v;
- struct timeval tv;
- fd_set s;
int ret;
cred.uid = 0;
cred.gid = 0;
while (1) {
- FD_ZERO(&s);
- FD_SET(sock, &s);
- tv.tv_sec = 2;
- tv.tv_usec = 0;
- ret = select(sock+1, &s, NULL, NULL, &tv);
- if (ret <= 0) {
- fprintf(stderr, "%s: bad select before read from parent: %s\n",
- __func__, strerror(errno));
- exit(1);
+ if (!wait_for_sock(sock, 2)) {
+ fprintf(stderr, "%s: timeout reading from parent\n", __func__);
+ _exit(1);
}
if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
fprintf(stderr, "%s: bad read from parent: %s\n",
__func__, strerror(errno));
- exit(1);
+ _exit(1);
}
if (vpid == -1) // done
break;
v = '1';
cred.pid = getpid();
if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
- exit(1);
+ _exit(1);
}
}
- exit(0);
+ _exit(0);
}
static void pid_from_ns_wrapper(int sock, pid_t tpid)
int newnsfd = -1, ret, cpipe[2];
char fnam[100];
pid_t cpid;
- fd_set s;
- struct timeval tv;
char v;
- sprintf(fnam, "/proc/%d/ns/pid", tpid);
+ ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
+ if (ret < 0 || ret >= sizeof(fnam))
+ _exit(1);
newnsfd = open(fnam, O_RDONLY);
if (newnsfd < 0)
- exit(1);
+ _exit(1);
if (setns(newnsfd, 0) < 0)
- exit(1);
+ _exit(1);
close(newnsfd);
if (pipe(cpipe) < 0)
- exit(1);
+ _exit(1);
loop:
cpid = fork();
if (cpid < 0)
- exit(1);
+ _exit(1);
if (!cpid) {
char b = '1';
}
// give the child 1 second to be done forking and
- // write it's ack
- FD_ZERO(&s);
- FD_SET(cpipe[0], &s);
- tv.tv_sec = 1;
- tv.tv_usec = 0;
- ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
- if (ret <= 0)
+ // write its ack
+ if (!wait_for_sock(cpipe[0], 1))
goto again;
ret = read(cpipe[0], &v, 1);
if (ret != sizeof(char) || v != '1') {
}
if (!wait_for_pid(cpid))
- exit(1);
- exit(0);
+ _exit(1);
+ _exit(0);
again:
kill(cpid, SIGKILL);
goto loop;
}
-static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
+/*
+ * Translate host @uid into the user namespace of @pid.
+ *
+ * Reads /proc/@pid/uid_map and stores the converted id in *@answer.
+ * Returns true when @uid is mapped.  Returns false when the map file
+ * cannot be opened (*@answer is then left untouched) or when the
+ * conversion yields -1, i.e. @uid has no mapping.
+ */
+bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
+{
+	FILE *f;
+	char line[400];
+	int ret;
+
+	/* Bounded formatting, same pattern as the other /proc path builders. */
+	ret = snprintf(line, sizeof(line), "/proc/%d/uid_map", pid);
+	if (ret < 0 || (size_t)ret >= sizeof(line))
+		return false;
+
+	f = fopen(line, "r");
+	if (!f)
+		return false;
+
+	*answer = convert_id_to_ns(f, uid);
+	fclose(f);
+
+	/* A result of -1 is treated as "not mapped". */
+	return *answer != (uid_t)-1;
+}
+
+/*
+ * get_pid_creds: get the real uid and gid of @pid from
+ * /proc/@pid/status
+ *
+ * *@uid and *@gid are preset to -1 and are only overwritten by a value
+ * that parsed successfully, so callers must treat -1 as "unknown".
+ * (XXX should we use euid here?)
+ */
+void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
+{
+	char line[400];
+	unsigned int id;	/* matches the %u conversion exactly */
+	FILE *f;
+	int ret;
+
+	*uid = -1;
+	*gid = -1;
+
+	ret = snprintf(line, sizeof(line), "/proc/%d/status", pid);
+	if (ret < 0 || (size_t)ret >= sizeof(line))
+		return;
+
+	f = fopen(line, "r");
+	if (!f) {
+		fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno));
+		return;
+	}
+
+	/* The path buffer is reused for the file contents from here on. */
+	while (fgets(line, sizeof(line), f)) {
+		if (strncmp(line, "Uid:", 4) == 0) {
+			/* Only the first number on the line is kept. */
+			if (sscanf(line + 4, "%u", &id) != 1) {
+				fprintf(stderr, "bad uid line for pid %u\n", (unsigned int)pid);
+				fclose(f);
+				return;
+			}
+			*uid = id;
+		} else if (strncmp(line, "Gid:", 4) == 0) {
+			if (sscanf(line + 4, "%u", &id) != 1) {
+				fprintf(stderr, "bad gid line for pid %u\n", (unsigned int)pid);
+				fclose(f);
+				return;
+			}
+			*gid = id;
+		}
+	}
+	fclose(f);
+}
+
+/*
+ * May the requestor @r move victim @v to a new cgroup?
+ * This is allowed if
+ *   . they are the same task
+ *   . they are owned by the same uid
+ *   . @r is root on the host, or
+ *   . @v's uid is mapped into @r's where @r is root.
+ *
+ * @r_uid is the uid of @r as passed in by the caller; it is compared
+ * against 0 and against the uid read from /proc for @v.
+ * Returns true when the move is permitted, false otherwise.
+ */
+bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
+{
+	uid_t v_uid, tmpuid;
+	gid_t v_gid;
+
+	if (r == v)
+		return true;
+	if (r_uid == 0)
+		return true;
+
+	get_pid_creds(v, &v_uid, &v_gid);
+	/*
+	 * get_pid_creds() leaves -1 behind when the victim's status could
+	 * not be read; never let that "unknown" value match anything.
+	 */
+	if (v_uid == (uid_t)-1)
+		return false;
+	if (r_uid == v_uid)
+		return true;
+
+	/* @r_uid must map to 0 in @r's userns and @v_uid must be mapped there. */
+	if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
+			&& hostuid_to_ns(v_uid, r, &tmpuid))
+		return true;
+	return false;
+}
+
+static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
+ const char *file, const char *buf)
{
int sock[2] = {-1, -1};
pid_t qpid, cpid = -1;
+ FILE *pids_file = NULL;
bool answer = false, fail = false;
+ pids_file = open_pids_file(contrl, cg);
+ if (!pids_file)
+ return false;
+
/*
* write the pids to a socket, have helper in writer's pidns
* call movepid for us
*/
if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
perror("socketpair");
- exit(1);
+ goto out;
}
cpid = fork();
if (cpid == -1)
goto out;
- if (!cpid) // child
+ if (!cpid) { // child
+ fclose(pids_file);
pid_from_ns_wrapper(sock[1], tpid);
+ }
const char *ptr = buf;
while (sscanf(ptr, "%d", &qpid) == 1) {
if (recv_creds(sock[0], &cred, &v)) {
if (v == '0') {
- if (!cgm_move_pid(contrl, cg, cred.pid))
+ if (!may_move_pid(tpid, tuid, cred.pid)) {
+ fail = true;
+ break;
+ }
+ if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
fail = true;
}
}
close(sock[0]);
close(sock[1]);
}
+ if (pids_file) {
+ if (fclose(pids_file) != 0)
+ answer = false;
+ }
return answer;
}
struct fuse_file_info *fi)
{
struct fuse_context *fc = fuse_get_context();
- nih_local char *localbuf = NULL;
- nih_local struct cgm_keys *k = NULL;
+ char *localbuf = NULL;
+ struct cgfs_files *k = NULL;
struct file_info *f = (struct file_info *)fi->fh;
+ bool r;
if (f->type != LXC_TYPE_CGFILE) {
fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
}
if (offset)
- return -EINVAL;
+ return 0;
if (!fc)
return -EIO;
- localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
+ localbuf = alloca(size+1);
localbuf[size] = '\0';
memcpy(localbuf, buf, size);
- if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
- bool r;
-
- if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY))
- return -EACCES;
+ if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
+ size = -EINVAL;
+ goto out;
+ }
- if (strcmp(f->file, "tasks") == 0 ||
- strcmp(f->file, "/tasks") == 0 ||
- strcmp(f->file, "/cgroup.procs") == 0 ||
- strcmp(f->file, "cgroup.procs") == 0)
- // special case - we have to translate the pids
- r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
- else
- r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
+ if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
+ size = -EACCES;
+ goto out;
+ }
- if (!r)
- return -EINVAL;
+ if (strcmp(f->file, "tasks") == 0 ||
+ strcmp(f->file, "/tasks") == 0 ||
+ strcmp(f->file, "/cgroup.procs") == 0 ||
+ strcmp(f->file, "cgroup.procs") == 0)
+ // special case - we have to translate the pids
+ r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
+ else
+ r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
- return size;
- }
+ if (!r)
+ size = -EINVAL;
- return -EINVAL;
+out:
+ free_key(k);
+ return size;
}
int cg_chown(const char *path, uid_t uid, gid_t gid)
{
struct fuse_context *fc = fuse_get_context();
- nih_local char * cgdir = NULL;
- char *fpath = NULL, *path1, *path2;
- nih_local struct cgm_keys *k = NULL;
+ char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
+ struct cgfs_files *k = NULL;
const char *cgroup;
- nih_local char *controller = NULL;
-
+ int ret;
if (!fc)
return -EIO;
/* this is just /cgroup/controller */
return -EINVAL;
- get_cgdir_and_path(cgroup, &cgdir, &fpath);
+ get_cgdir_and_path(cgroup, &cgdir, &last);
- if (!fpath) {
+ if (!last) {
path1 = "/";
path2 = cgdir;
} else {
path1 = cgdir;
- path2 = fpath;
+ path2 = last;
}
if (is_child_cgroup(controller, path1, path2)) {
// get uid, gid, from '/tasks' file and make up a mode
// That is a hack, until cgmanager gains a GetCgroupPerms fn.
- k = get_cgroup_key(controller, cgroup, "tasks");
+ k = cgfs_get_key(controller, cgroup, "tasks");
} else
- k = get_cgroup_key(controller, path1, path2);
+ k = cgfs_get_key(controller, path1, path2);
- if (!k)
- return -EINVAL;
+ if (!k) {
+ ret = -EINVAL;
+ goto out;
+ }
/*
* This being a fuse request, the uid and gid must be valid
* sure that the caller is root in his uid, and privileged
* over the file's current owner.
*/
- if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
- return -EACCES;
+ if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
+ ret = -EACCES;
+ goto out;
+ }
- if (!cgm_chown_file(controller, cgroup, uid, gid))
- return -EINVAL;
- return 0;
+ ret = cgfs_chown_file(controller, cgroup, uid, gid);
+
+out:
+ free_key(k);
+ free(cgdir);
+
+ return ret;
}
int cg_chmod(const char *path, mode_t mode)
{
struct fuse_context *fc = fuse_get_context();
- nih_local char * cgdir = NULL;
- char *fpath = NULL, *path1, *path2;
- nih_local struct cgm_keys *k = NULL;
+ char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
+ struct cgfs_files *k = NULL;
const char *cgroup;
- nih_local char *controller = NULL;
+ int ret;
if (!fc)
return -EIO;
/* this is just /cgroup/controller */
return -EINVAL;
- get_cgdir_and_path(cgroup, &cgdir, &fpath);
+ get_cgdir_and_path(cgroup, &cgdir, &last);
- if (!fpath) {
+ if (!last) {
path1 = "/";
path2 = cgdir;
} else {
path1 = cgdir;
- path2 = fpath;
+ path2 = last;
}
if (is_child_cgroup(controller, path1, path2)) {
// get uid, gid, from '/tasks' file and make up a mode
// That is a hack, until cgmanager gains a GetCgroupPerms fn.
- k = get_cgroup_key(controller, cgroup, "tasks");
+ k = cgfs_get_key(controller, cgroup, "tasks");
} else
- k = get_cgroup_key(controller, path1, path2);
+ k = cgfs_get_key(controller, path1, path2);
- if (!k)
- return -EINVAL;
+ if (!k) {
+ ret = -EINVAL;
+ goto out;
+ }
/*
* This being a fuse request, the uid and gid must be valid
* sure that the caller is root in his uid, and privileged
* over the file's current owner.
*/
- if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
- return -EPERM;
+ if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
+ ret = -EPERM;
+ goto out;
+ }
- if (!cgm_chmod_file(controller, cgroup, mode))
- return -EINVAL;
- return 0;
+ if (!cgfs_chmod_file(controller, cgroup, mode)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ free_key(k);
+ free(cgdir);
+ return ret;
}
int cg_mkdir(const char *path, mode_t mode)
{
struct fuse_context *fc = fuse_get_context();
- nih_local struct cgm_keys **list = NULL;
- char *fpath = NULL, *path1;
- nih_local char * cgdir = NULL;
+ char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
const char *cgroup;
- nih_local char *controller = NULL;
+ int ret;
if (!fc)
return -EIO;
if (!cgroup)
return -EINVAL;
- get_cgdir_and_path(cgroup, &cgdir, &fpath);
- if (!fpath)
+ get_cgdir_and_path(cgroup, &cgdir, &last);
+ if (!last)
path1 = "/";
else
path1 = cgdir;
- if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
- return -EACCES;
+ pid_t initpid = lookup_initpid_in_store(fc->pid);
+ if (initpid <= 0)
+ initpid = fc->pid;
+ if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
+ if (!next)
+ ret = -EINVAL;
+ else if (last && strcmp(next, last) == 0)
+ ret = -EEXIST;
+ else
+ ret = -ENOENT;
+ goto out;
+ }
+ if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
+ ret = -EACCES;
+ goto out;
+ }
+ if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
+ ret = -EACCES;
+ goto out;
+ }
- if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
- return -EINVAL;
+ ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
- return 0;
+out:
+ free(cgdir);
+ free(next);
+ return ret;
}
+/*
+ * Remove the cgroup directory named by @path.
+ * Returns 0 on success or a negative errno value: -EIO without a fuse
+ * context, -EINVAL for an unusable path or a failed removal, -EBUSY or
+ * -ENOENT when the first ancestor check fails, -EACCES when access is
+ * denied.
+ */
static int cg_rmdir(const char *path)
{
struct fuse_context *fc = fuse_get_context();
- nih_local struct cgm_keys **list = NULL;
- char *fpath = NULL;
- nih_local char * cgdir = NULL;
+ char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
const char *cgroup;
- nih_local char *controller = NULL;
+ int ret;
if (!fc)
return -EIO;
-
controller = pick_controller_from_path(fc, path);
if (!controller)
return -EINVAL;
if (!cgroup)
return -EINVAL;
- get_cgdir_and_path(cgroup, &cgdir, &fpath);
- if (!fpath)
- return -EINVAL;
+ get_cgdir_and_path(cgroup, &cgdir, &last);
+ if (!last) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ pid_t initpid = lookup_initpid_in_store(fc->pid);
+ if (initpid <= 0)
+ initpid = fc->pid;
+ if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
+ /* next may still be NULL here (cg_mkdir tests for that as well); check it before strcmp() */
+ if (!last || (next && strcmp(next, last) == 0))
+ ret = -EBUSY;
+ else
+ ret = -ENOENT;
+ goto out;
+ }
- if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
- return -EACCES;
+ if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
+ ret = -EACCES;
+ goto out;
+ }
+ if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
+ ret = -EACCES;
+ goto out;
+ }
- if (!cgm_remove(controller, cgroup))
- return -EINVAL;
+ if (!cgfs_remove(controller, cgroup)) {
+ ret = -EINVAL;
+ goto out;
+ }
- return 0;
+ ret = 0;
+
+out:
+ free(cgdir);
+ free(next);
+ return ret;
}
static bool startswith(const char *line, const char *pref)
if (startswith(str, key)) {
sscanf(str + len, "%lu", v);
return;
- }
- eol = strchr(str, '\n');
- if (!eol)
- return;
- str = eol+1;
- }
-}
-
-static char *get_pid_cgroup(pid_t pid, const char *contrl)
-{
- nih_local char *fnam = NULL;
- FILE *f;
- char *answer = NULL;
- char *line = NULL;
- size_t len = 0;
-
- fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
- if (!(f = fopen(fnam, "r")))
- return false;
-
- while (getline(&line, &len, f) != -1) {
- char *c1, *c2;
- if (!line[0])
- continue;
- c1 = strchr(line, ':');
- if (!c1)
- goto out;
- c1++;
- c2 = strchr(c1, ':');
- if (!c2)
- goto out;
- *c2 = '\0';
- if (strcmp(c1, contrl) != 0)
- continue;
- c2++;
- stripnewline(c2);
- answer = NIH_MUST( nih_strdup(NULL, c2) );
- goto out;
+ }
+ eol = strchr(str, '\n');
+ if (!eol)
+ return;
+ str = eol+1;
}
+}
-out:
+/*
+ * Copy the contents of @path into the per-open cache @d->buf and hand
+ * the first @size bytes back in @buf.
+ *
+ * On success @d->size holds the full cached length and @d->cached is
+ * set, so later reads at a non-zero offset can be served from the cache.
+ * Returns the number of bytes copied into @buf, or 0 when @path cannot
+ * be opened or its contents do not fit into the cache.
+ */
+static int read_file(const char *path, char *buf, size_t size,
+		struct file_info *d)
+{
+	size_t linelen = 0, total_len = 0;
+	int rv = 0;
+	char *line = NULL;
+	char *cache = d->buf;
+	size_t cache_size = d->buflen;
+	FILE *f = fopen(path, "r");
+	if (!f)
+		return 0;
+
+	while (getline(&line, &linelen, f) != -1) {
+		/* snprintf() returns int; keep it signed so the error check is live */
+		int l = snprintf(cache, cache_size, "%s", line);
+		if (l < 0) {
+			perror("Error writing to cache");
+			goto err;
+		}
+		if ((size_t)l >= cache_size) {
+			fprintf(stderr, "Internal error: truncated write to cache\n");
+			goto err;
+		}
+		cache += l;
+		cache_size -= l;
+		total_len += l;
+	}
+
+	/* Callers return 0 for offset reads unless the cache is marked valid. */
+	d->cached = 1;
+	d->size = total_len;
+	if (total_len > size)
+		total_len = size;
+
+	/* read from off 0 */
+	memcpy(buf, d->buf, total_len);
+	rv = total_len;
+ err:
fclose(f);
free(line);
- return answer;
+ return rv;
}
/*
* FUSE ops for /proc
*/
+/*
+ * Fetch memory.limit_in_bytes for @cgroup.
+ * Returns the parsed limit, or (unsigned long)-1 when the value could
+ * not be read.
+ */
+static unsigned long get_memlimit(const char *cgroup)
+{
+	char *raw = NULL;
+	unsigned long limit;
+
+	if (!cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &raw)) {
+		free(raw);
+		return (unsigned long)-1;
+	}
+
+	limit = strtoul(raw, NULL, 10);
+	free(raw);
+	return limit;
+}
+
+/*
+ * Return the smallest memory.limit_in_bytes found on @cgroup or any of
+ * its ancestors.  Levels whose limit cannot be read (get_memlimit()
+ * returns -1) are ignored while walking up.
+ */
+static unsigned long get_min_memlimit(const char *cgroup)
+{
+	char *copy = strdupa(cgroup);
+	unsigned long memlimit = 0, retlimit;
+
+	retlimit = get_memlimit(copy);
+
+	/*
+	 * dirname() settles on "/" for absolute paths and on "." for
+	 * relative or empty ones; stop on either so the walk always ends.
+	 */
+	while (strcmp(copy, "/") != 0 && strcmp(copy, ".") != 0) {
+		copy = dirname(copy);
+		memlimit = get_memlimit(copy);
+		if (memlimit != (unsigned long)-1 && memlimit < retlimit)
+			retlimit = memlimit;
+	}
+
+	return retlimit;
+}
+
static int proc_meminfo_read(char *buf, size_t size, off_t offset,
struct fuse_file_info *fi)
{
struct fuse_context *fc = fuse_get_context();
struct file_info *d = (struct file_info *)fi->fh;
- nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
- nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
- unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
+ char *cg;
+ char *memusage_str = NULL, *memstat_str = NULL,
+ *memswlimit_str = NULL, *memswusage_str = NULL,
+ *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
+ unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
+ cached = 0, hosttotal = 0;
char *line = NULL;
- size_t linelen = 0, total_len = 0;
+ size_t linelen = 0, total_len = 0, rv = 0;
char *cache = d->buf;
size_t cache_size = d->buflen;
- FILE *f;
+ FILE *f = NULL;
if (offset){
if (offset > d->size)
return -EINVAL;
+ if (!d->cached)
+ return 0;
int left = d->size - offset;
total_len = left > size ? size: left;
memcpy(buf, cache + offset, total_len);
return total_len;
}
+ pid_t initpid = lookup_initpid_in_store(fc->pid);
+ if (initpid <= 0)
+ initpid = fc->pid;
+ cg = get_pid_cgroup(initpid, "memory");
if (!cg)
- return 0;
+ return read_file("/proc/meminfo", buf, size, d);
+
+ memlimit = get_min_memlimit(cg);
+ if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
+ goto err;
+ if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
+ goto err;
+
+ // Following values are allowed to fail, because swapaccount might be turned
+ // off for current kernel
+ if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
+ cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
+ {
+ /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
+ if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
+ goto err;
+ if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
+ goto err;
+
+ memswlimit = strtoul(memswlimit_str, NULL, 10);
+ memswusage = strtoul(memswusage_str, NULL, 10);
+
+ if (!strcmp(memswlimit_str, memswlimit_default_str))
+ memswlimit = 0;
+ if (!strcmp(memswusage_str, memswusage_default_str))
+ memswusage = 0;
+
+ memswlimit = memswlimit / 1024;
+ memswusage = memswusage / 1024;
+ }
- if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
- return 0;
- if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
- return 0;
- if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
- return 0;
- memlimit = strtoul(memlimit_str, NULL, 10);
memusage = strtoul(memusage_str, NULL, 10);
memlimit /= 1024;
memusage /= 1024;
+
get_mem_cached(memstat_str, &cached);
f = fopen("/proc/meminfo", "r");
if (!f)
- return 0;
+ goto err;
while (getline(&line, &linelen, f) != -1) {
size_t l;
} else if (startswith(line, "MemAvailable:")) {
snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
printme = lbuf;
+ } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
+ snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
+ printme = lbuf;
+ } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
+ snprintf(lbuf, 100, "SwapFree: %8lu kB\n",
+ (memswlimit - memlimit) - (memswusage - memusage));
+ printme = lbuf;
} else if (startswith(line, "Buffers:")) {
snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
printme = lbuf;
printme = line;
l = snprintf(cache, cache_size, "%s", printme);
+ if (l < 0) {
+ perror("Error writing to cache");
+ rv = 0;
+ goto err;
+
+ }
+ if (l >= cache_size) {
+ fprintf(stderr, "Internal error: truncated write to cache\n");
+ rv = 0;
+ goto err;
+ }
+
cache += l;
cache_size -= l;
total_len += l;
}
+ d->cached = 1;
d->size = total_len;
if (total_len > size ) total_len = size;
memcpy(buf, d->buf, total_len);
- fclose(f);
+ rv = total_len;
+err:
+ if (f)
+ fclose(f);
free(line);
- return total_len;
+ free(cg);
+ free(memusage_str);
+ free(memswlimit_str);
+ free(memswusage_str);
+ free(memstat_str);
+ free(memswlimit_default_str);
+ free(memswusage_default_str);
+ return rv;
}
/*
* Read the cpuset.cpus for cg
- * Return the answer in a nih_alloced string
+ * Return the answer in a newly allocated string which must be freed
*/
static char *get_cpuset(const char *cg)
{
char *answer;
- if (!cgm_get_value("cpuset", cg, "cpuset.cpus", &answer))
+ if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer))
return NULL;
return answer;
}
-/*
- * Helper functions for cpuset_in-set
- */
-char *cpuset_nexttok(const char *c)
-{
- char *r = strchr(c+1, ',');
- if (r)
- return r+1;
- return NULL;
-}
-
-int cpuset_getrange(const char *c, int *a, int *b)
-{
- int ret;
-
- ret = sscanf(c, "%d-%d", a, b);
- return ret;
-}
-
-/*
- * cpusets are in format "1,2-3,4"
- * iow, comma-delimited ranges
- */
-static bool cpu_in_cpuset(int cpu, const char *cpuset)
-{
- const char *c;
-
- for (c = cpuset; c; c = cpuset_nexttok(c)) {
- int a, b, ret;
-
- ret = cpuset_getrange(c, &a, &b);
- if (ret == 1 && cpu == a)
- return true;
- if (ret != 2) // bad cpuset!
- return false;
- if (cpu >= a && cpu <= b)
- return true;
- }
-
- return false;
-}
+bool cpu_in_cpuset(int cpu, const char *cpuset);
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
{
struct fuse_context *fc = fuse_get_context();
struct file_info *d = (struct file_info *)fi->fh;
- nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
- nih_local char *cpuset = NULL;
+ char *cg;
+ char *cpuset = NULL;
char *line = NULL;
- size_t linelen = 0, total_len = 0;
+ size_t linelen = 0, total_len = 0, rv = 0;
bool am_printing = false;
int curcpu = -1;
char *cache = d->buf;
size_t cache_size = d->buflen;
- FILE *f;
+ FILE *f = NULL;
if (offset){
if (offset > d->size)
return -EINVAL;
+ if (!d->cached)
+ return 0;
int left = d->size - offset;
total_len = left > size ? size: left;
memcpy(buf, cache + offset, total_len);
return total_len;
}
+ pid_t initpid = lookup_initpid_in_store(fc->pid);
+ if (initpid <= 0)
+ initpid = fc->pid;
+ cg = get_pid_cgroup(initpid, "cpuset");
if (!cg)
- return 0;
+ return read_file("proc/cpuinfo", buf, size, d);
cpuset = get_cpuset(cg);
if (!cpuset)
- return 0;
+ goto err;
f = fopen("/proc/cpuinfo", "r");
if (!f)
- return 0;
+ goto err;
while (getline(&line, &linelen, f) != -1) {
size_t l;
if (am_printing) {
curcpu ++;
l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
+ if (l < 0) {
+ perror("Error writing to cache");
+ rv = 0;
+ goto err;
+ }
+ if (l >= cache_size) {
+ fprintf(stderr, "Internal error: truncated write to cache\n");
+ rv = 0;
+ goto err;
+ }
if (l < cache_size){
cache += l;
cache_size -= l;
}
if (am_printing) {
l = snprintf(cache, cache_size, "%s", line);
+ if (l < 0) {
+ perror("Error writing to cache");
+ rv = 0;
+ goto err;
+ }
+ if (l >= cache_size) {
+ fprintf(stderr, "Internal error: truncated write to cache\n");
+ rv = 0;
+ goto err;
+ }
if (l < cache_size) {
cache += l;
cache_size -= l;
}
}
+ d->cached = 1;
d->size = total_len;
if (total_len > size ) total_len = size;
/* read from off 0 */
memcpy(buf, d->buf, total_len);
-
- fclose(f);
+ rv = total_len;
+err:
+ if (f)
+ fclose(f);
free(line);
- return total_len;
+ free(cpuset);
+ free(cg);
+ return rv;
}
static int proc_stat_read(char *buf, size_t size, off_t offset,
{
struct fuse_context *fc = fuse_get_context();
struct file_info *d = (struct file_info *)fi->fh;
- nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
- nih_local char *cpuset = NULL;
+ char *cg;
+ char *cpuset = NULL;
char *line = NULL;
- size_t linelen = 0, total_len = 0;
+ size_t linelen = 0, total_len = 0, rv = 0;
int curcpu = -1; /* cpu numbering starts at 0 */
unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
/* reserve for cpu all */
char *cache = d->buf + CPUALL_MAX_SIZE;
size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
- FILE *f;
+ FILE *f = NULL;
if (offset){
if (offset > d->size)
return -EINVAL;
+ if (!d->cached)
+ return 0;
int left = d->size - offset;
total_len = left > size ? size: left;
memcpy(buf, d->buf + offset, total_len);
return total_len;
}
+ pid_t initpid = lookup_initpid_in_store(fc->pid);
+ if (initpid <= 0)
+ initpid = fc->pid;
+ cg = get_pid_cgroup(initpid, "cpuset");
if (!cg)
- return 0;
+ return read_file("/proc/stat", buf, size, d);
cpuset = get_cpuset(cg);
if (!cpuset)
- return 0;
+ goto err;
f = fopen("/proc/stat", "r");
if (!f)
- return 0;
+ goto err;
//skip first line
if (getline(&line, &linelen, f) < 0) {
fprintf(stderr, "proc_stat_read read first line failed\n");
- goto out;
+ goto err;
}
while (getline(&line, &linelen, f) != -1) {
if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
/* not a ^cpuN line containing a number N, just print it */
l = snprintf(cache, cache_size, "%s", line);
- if (l < cache_size){
+ if (l < 0) {
+ perror("Error writing to cache");
+ rv = 0;
+ goto err;
+ }
+ if (l >= cache_size) {
+ fprintf(stderr, "Internal error: truncated write to cache\n");
+ rv = 0;
+ goto err;
+ }
+ if (l < cache_size) {
cache += l;
cache_size -= l;
total_len += l;
continue;
- }else{
+ } else {
//no more space, break it
cache += cache_size;
total_len += cache_size;
if (!c)
continue;
l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
+ if (l < 0) {
+ perror("Error writing to cache");
+ rv = 0;
+ goto err;
+
+ }
+ if (l >= cache_size) {
+ fprintf(stderr, "Internal error: truncated write to cache\n");
+ rv = 0;
+ goto err;
+ }
+
cache += l;
cache_size -= l;
total_len += l;
if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
memcpy(cache, cpuall, cpuall_len);
cache += cpuall_len;
- }else{
+ } else{
/* shouldn't happen */
fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
cpuall_len = 0;
memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
total_len += cpuall_len;
+ d->cached = 1;
d->size = total_len;
if (total_len > size ) total_len = size;
memcpy(buf, d->buf, total_len);
-out:
- fclose(f);
+ rv = total_len;
+
+err:
+ if (f)
+ fclose(f);
free(line);
- return total_len;
+ free(cpuset);
+ free(cg);
+ return rv;
}
-/*
- * How to guess what to present for uptime?
- * One thing we could do would be to take the date on the caller's
- * memory.usage_in_bytes file, which should equal the time of creation
- * of his cgroup. However, a task could be in a sub-cgroup of the
- * container. The same problem exists if we try to look at the ages
- * of processes in the caller's cgroup.
- *
- * So we'll fork a task that will enter the caller's pidns, mount a
- * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
- *
- * For the second uptime #, we'll do as Stéphane had done, just copy
- * the number from /proc/uptime. Not sure how to best emulate 'idle'
- * time. Maybe someone can come up with a good algorithm and submit a
- * patch. Maybe something based on cpushare info?
- */
-
-/* return age of the reaper for $pid, taken from ctime of its procdir */
-static long int get_pid1_time(pid_t pid)
+static long int getreaperage(pid_t pid)
{
char fnam[100];
- int fd, cpipe[2], ret;
struct stat sb;
- pid_t cpid;
- struct timeval tv;
- fd_set s;
- char v;
-
- if (unshare(CLONE_NEWNS))
- return 0;
+ int ret;
+ pid_t qpid;
- if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
- perror("rslave mount failed");
+ qpid = lookup_initpid_in_store(pid);
+ if (qpid <= 0)
return 0;
- }
- sprintf(fnam, "/proc/%d/ns/pid", pid);
- fd = open(fnam, O_RDONLY);
- if (fd < 0) {
- perror("get_pid1_time open of ns/pid");
- return 0;
- }
- if (setns(fd, 0)) {
- perror("get_pid1_time setns 1");
- close(fd);
+ ret = snprintf(fnam, 100, "/proc/%d", qpid);
+ if (ret < 0 || ret >= 100)
return 0;
- }
- close(fd);
-
- if (pipe(cpipe) < 0)
- exit(1);
-loop:
- cpid = fork();
- if (cpid < 0)
+ if (lstat(fnam, &sb) < 0)
return 0;
- if (!cpid) {
- char b = '1';
- close(cpipe[0]);
- if (write(cpipe[1], &b, sizeof(char)) < 0) {
- fprintf(stderr, "%s (child): erorr on write: %s\n",
- __func__, strerror(errno));
- }
- close(cpipe[1]);
- umount2("/proc", MNT_DETACH);
- if (mount("proc", "/proc", "proc", 0, NULL)) {
- perror("get_pid1_time mount");
- return 0;
- }
- ret = lstat("/proc/1", &sb);
- if (ret) {
- perror("get_pid1_time lstat");
- return 0;
- }
- return time(NULL) - sb.st_ctime;
- }
-
- // give the child 1 second to be done forking and
- // write it's ack
- FD_ZERO(&s);
- FD_SET(cpipe[0], &s);
- tv.tv_sec = 1;
- tv.tv_usec = 0;
- ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
- if (ret <= 0)
- goto again;
- ret = read(cpipe[0], &v, 1);
- if (ret != sizeof(char) || v != '1') {
- goto again;
- }
-
- wait_for_pid(cpid);
- exit(0);
-
-again:
- kill(cpid, SIGKILL);
- wait_for_pid(cpid);
- goto loop;
+ return time(NULL) - sb.st_ctime;
}
-static long int getreaperage(pid_t qpid)
+static unsigned long get_reaper_busy(pid_t task)
{
- int pid, mypipe[2], ret;
- struct timeval tv;
- fd_set s;
- long int mtime, answer = 0;
+ pid_t initpid = lookup_initpid_in_store(task);
+ char *cgroup = NULL, *usage_str = NULL;
+ unsigned long usage = 0;
- if (pipe(mypipe)) {
+ if (initpid <= 0)
return 0;
- }
-
- pid = fork();
-
- if (!pid) { // child
- mtime = get_pid1_time(qpid);
- if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
- fprintf(stderr, "Warning: bad write from getreaperage\n");
- exit(0);
- }
- close(mypipe[1]);
- FD_ZERO(&s);
- FD_SET(mypipe[0], &s);
- tv.tv_sec = 1;
- tv.tv_usec = 0;
- ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
- if (ret <= 0) {
- perror("select");
- goto out;
- }
- if (!ret) {
- fprintf(stderr, "timed out\n");
+ cgroup = get_pid_cgroup(initpid, "cpuacct");
+ if (!cgroup)
goto out;
- }
- if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
- perror("read");
+ if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
goto out;
- }
- answer = mtime;
+ usage = strtoul(usage_str, NULL, 10);
+ usage /= 1000000000;
out:
- wait_for_pid(pid);
- close(mypipe[0]);
- return answer;
-}
-
-static long int getprocidle(void)
-{
- FILE *f = fopen("/proc/uptime", "r");
- long int age, idle;
- int ret;
- if (!f)
- return 0;
- ret = fscanf(f, "%ld %ld", &age, &idle);
- fclose(f);
- if (ret != 2)
- return 0;
- return idle;
+ free(cgroup);
+ free(usage_str);
+ return usage;
}
/*
{
struct fuse_context *fc = fuse_get_context();
struct file_info *d = (struct file_info *)fi->fh;
- long int reaperage = getreaperage(fc->pid);;
- long int idletime = getprocidle();
+ long int reaperage = getreaperage(fc->pid);
+ unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
+ char *cache = d->buf;
size_t total_len = 0;
if (offset){
if (offset > d->size)
return -EINVAL;
+ if (!d->cached)
+ return 0;
+ int left = d->size - offset;
+ total_len = left > size ? size: left;
+ memcpy(buf, cache + offset, total_len);
+ return total_len;
+ }
+
+ idletime = reaperage - busytime;
+ if (idletime > reaperage)
+ idletime = reaperage;
+
+ total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
+ if (total_len < 0){
+ perror("Error writing to cache");
return 0;
}
- total_len = snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
- d->size = total_len;
+ d->size = (int)total_len;
+ d->cached = 1;
+
+ if (total_len > size) total_len = size;
+
+ memcpy(buf, d->buf, total_len);
return total_len;
}
char dev_name[72];
struct fuse_context *fc = fuse_get_context();
struct file_info *d = (struct file_info *)fi->fh;
- nih_local char *cg = get_pid_cgroup(fc->pid, "blkio");
- nih_local char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
+ char *cg;
+ char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
*io_wait_time_str = NULL, *io_service_time_str = NULL;
unsigned long read = 0, write = 0;
unsigned long read_merged = 0, write_merged = 0;
unsigned long read_ticks = 0, write_ticks = 0;
unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
+ char *cache = d->buf;
+ size_t cache_size = d->buflen;
char *line = NULL;
- size_t linelen = 0, total_len = 0;
+ size_t linelen = 0, total_len = 0, rv = 0;
unsigned int major = 0, minor = 0;
int i = 0;
- FILE *f;
+ FILE *f = NULL;
if (offset){
if (offset > d->size)
return -EINVAL;
- return 0;
+ if (!d->cached)
+ return 0;
+ int left = d->size - offset;
+ total_len = left > size ? size: left;
+ memcpy(buf, cache + offset, total_len);
+ return total_len;
}
+ pid_t initpid = lookup_initpid_in_store(fc->pid);
+ if (initpid <= 0)
+ initpid = fc->pid;
+ cg = get_pid_cgroup(initpid, "blkio");
if (!cg)
- return 0;
+ return read_file("/proc/diskstats", buf, size, d);
- if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
- return 0;
- if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
- return 0;
- if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
- return 0;
- if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
- return 0;
- if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
- return 0;
+ if (!cgfs_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
+ goto err;
+ if (!cgfs_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
+ goto err;
+ if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
+ goto err;
+ if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
+ goto err;
+ if (!cgfs_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
+ goto err;
f = fopen("/proc/diskstats", "r");
if (!f)
- return 0;
+ goto err;
while (getline(&line, &linelen, f) != -1) {
size_t l;
char *printme, lbuf[256];
- i = sscanf(line, "%u %u %s", &major, &minor, dev_name);
+ i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
if(i == 3){
get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
} else
continue;
- l = snprintf(buf, size, "%s", printme);
- buf += l;
- size -= l;
+ l = snprintf(cache, cache_size, "%s", printme);
+ if (l < 0) {
+ perror("Error writing to fuse buf");
+ rv = 0;
+ goto err;
+ }
+ if (l >= cache_size) {
+ fprintf(stderr, "Internal error: truncated write to cache\n");
+ rv = 0;
+ goto err;
+ }
+ cache += l;
+ cache_size -= l;
total_len += l;
}
+ d->cached = 1;
d->size = total_len;
+ if (total_len > size ) total_len = size;
+ memcpy(buf, d->buf, total_len);
- fclose(f);
+ rv = total_len;
+err:
+ free(cg);
+ if (f)
+ fclose(f);
free(line);
- return total_len;
+ free(io_serviced_str);
+ free(io_merged_str);
+ free(io_service_bytes_str);
+ free(io_wait_time_str);
+ free(io_service_time_str);
+ return rv;
}
static off_t get_procfile_size(const char *which)
strcmp(path, "/proc/uptime") == 0 ||
strcmp(path, "/proc/stat") == 0 ||
strcmp(path, "/proc/diskstats") == 0) {
- sb->st_size = get_procfile_size(path);
+ sb->st_size = 0;
sb->st_mode = S_IFREG | 00444;
sb->st_nlink = 1;
return 0;
if (type == -1)
return -ENOENT;
- info = NIH_MUST( nih_alloc(NULL, sizeof(*info)) );
+ info = malloc(sizeof(*info));
+ if (!info)
+ return -ENOMEM;
+
memset(info, 0, sizeof(*info));
info->type = type;
info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
- info->buf = NIH_MUST( nih_alloc(info, info->buflen) );
+ do {
+ info->buf = malloc(info->buflen);
+ } while (!info->buf);
memset(info->buf, 0, info->buflen);
/* set actual size to buffer size */
info->size = info->buflen;
{
fprintf(stderr, "Usage:\n");
fprintf(stderr, "\n");
- fprintf(stderr, "%s [FUSE and mount options] mountpoint\n", me);
+ fprintf(stderr, "%s mountpoint\n", me);
+ fprintf(stderr, "%s -h\n", me);
exit(1);
}
return false;
}
-int main(int argc, char *argv[])
+/*
+ * Drop the first argument equal to 'which' from a NULL-terminated argv.
+ *
+ * The scan starts at argv[1].  On a match every following entry, including
+ * the terminating NULL, is shifted down one slot and *argcp is decremented.
+ * Only the first match is removed; if nothing matches, argv and *argcp are
+ * left untouched.
+ */
+void swallow_arg(int *argcp, char *argv[], char *which)
{
- int ret;
- struct lxcfs_state *d;
+ int i;
- if (argc < 2 || is_help(argv[1]))
- usage(argv[0]);
+ for (i = 1; argv[i]; i++) {
+ if (strcmp(argv[i], which) != 0)
+ continue;
+ /* close the gap: the last copy moves the NULL terminator down */
+ for (; argv[i]; i++) {
+ argv[i] = argv[i+1];
+ }
+ (*argcp)--;
+ return;
+ }
+}
- d = malloc(sizeof(*d));
- if (!d)
- return -1;
+/*
+ * Drop the first "opt v" pair from a NULL-terminated argv.
+ *
+ * The scan starts at argv[1].  An 'opt' that is the last argument (nothing
+ * follows it) is ignored.  If 'opt' is followed by anything other than 'v',
+ * a message is printed to stderr and the process exits with status 1.
+ * Otherwise the two entries are removed by shifting the rest of argv,
+ * including the terminating NULL, down two slots, and *argcp is reduced
+ * by 2.
+ */
+void swallow_option(int *argcp, char *argv[], char *opt, char *v)
+{
+ int i;
+
+ for (i = 1; argv[i]; i++) {
+ if (!argv[i+1])
+ continue;
+ if (strcmp(argv[i], opt) != 0)
+ continue;
+ if (strcmp(argv[i+1], v) != 0) {
+ /* NOTE(review): this prints v (the expected value), not the argv[i+1] that was actually found */
+ fprintf(stderr, "Warning: unexpected fuse option %s\n", v);
+ exit(1);
+ }
+ /* argv[i+1] is non-NULL here, so argv[i+2] is at most the NULL terminator */
+ for (; argv[i+1]; i++) {
+ argv[i] = argv[i+2];
+ }
+ (*argcp) -= 2;
+ return;
+ }
+}
+
+/*
+ * Program entry point.
+ *
+ * Strips the legacy "-s", "-f" and "-o allow_other" arguments, handles
+ * "--version" and the help/usage case, then builds a fixed argument vector
+ * and passes it to fuse_main().  Returns fuse_main()'s result, or -1 when
+ * cgfs_setup_controllers() fails.
+ */
+int main(int argc, char *argv[])
+{
+ int ret = -1;
+ /*
+ * what we pass to fuse_main is:
+ * argv[0] -f -o allow_other,direct_io,entry_timeout=0.5,attr_timeout=0.5 argv[1] NULL
+ * (nargs counts the five real arguments, not the trailing NULL)
+ */
+ int nargs = 5, cnt = 0;
+ char *newargv[6];
+
+#ifdef FORTRAVIS
+ /* for travis which runs on 12.04 */
+ if (glib_check_version (2, 36, 0) != NULL)
+ g_type_init ();
+#endif
+
+ /* accommodate older init scripts that still pass -s, -f and -o allow_other */
+ swallow_arg(&argc, argv, "-s");
+ swallow_arg(&argc, argv, "-f");
+ swallow_option(&argc, argv, "-o", "allow_other");
+
+ if (argc == 2 && strcmp(argv[1], "--version") == 0) {
+ fprintf(stderr, "%s\n", VERSION);
+ exit(0);
+ }
+ if (argc != 2 || is_help(argv[1]))
+ usage(argv[0]);
- if (!cgm_escape_cgroup())
- fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
+ newargv[cnt++] = argv[0];
+ newargv[cnt++] = "-f";
+ newargv[cnt++] = "-o";
+ newargv[cnt++] = "allow_other,direct_io,entry_timeout=0.5,attr_timeout=0.5";
+ newargv[cnt++] = argv[1];
+ newargv[cnt++] = NULL;
- if (!cgm_get_controllers(&d->subsystems))
- return -1;
+ if (!cgfs_setup_controllers())
+ goto out;
- ret = fuse_main(argc, argv, &lxcfs_ops, d);
+ ret = fuse_main(nargs, newargv, &lxcfs_ops, NULL);
+out:
return ret;
}