#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
+#include <signal.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>
+#include "api_extensions.h"
#include "bindings.h"
#include "config.h"
#include "cgroup_fuse.h"
#include "proc_cpuview.h"
#include "utils.h"
+static bool can_use_pidfd;
+
/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
-static int pivot_root(const char * new_root, const char * put_old)
+static int pivot_root(const char *new_root, const char *put_old)
{
#ifdef __NR_pivot_root
-return syscall(__NR_pivot_root, new_root, put_old);
+ return syscall(__NR_pivot_root, new_root, put_old);
#else
-errno = ENOSYS;
-return -1;
+ /* No __NR_pivot_root syscall number available: fail with ENOSYS. */
+ errno = ENOSYS;
+ return -1;
#endif
}
#else
-extern int pivot_root(const char * new_root, const char * put_old);
+extern int pivot_root(const char *new_root, const char *put_old);
#endif
/*
* cached initpid.
*/
struct pidns_init_store {
- ino_t ino; // inode number for /proc/$pid/ns/pid
- pid_t initpid; // the pid of nit in that ns
- long int ctime; // the time at which /proc/$initpid was created
+ ino_t ino; /* inode number for /proc/$pid/ns/pid */
+ pid_t initpid; /* the pid of init in that ns */
+ int init_pidfd; /* pidfd for initpid; negative when no pidfd is held */
+ long int ctime; /* the time at which /proc/$initpid was created */
struct pidns_init_store *next;
long int lastcheck;
};
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Lock @l. A non-zero return from pthread_mutex_lock() is reported through
+ * log_exit() together with its strerror() text.
+ */
static void lock_mutex(pthread_mutex_t *l)
{
int ret;
- if ((ret = pthread_mutex_lock(l)) != 0) {
- lxcfs_error("returned:%d %s\n", ret, strerror(ret));
- exit(1);
- }
+ ret = pthread_mutex_lock(l);
+ if (ret)
+ log_exit("%s - returned %d\n", strerror(ret), ret);
}
struct cgroup_ops *cgroup_ops;
{
int ret;
- if ((ret = pthread_mutex_unlock(l)) != 0) {
- lxcfs_error("returned:%d %s\n", ret, strerror(ret));
- exit(1);
- }
+ ret = pthread_mutex_unlock(l);
+ if (ret)
+ log_exit("%s - returned %d\n", strerror(ret), ret);
}
static void store_lock(void)
unlock_mutex(&pidns_store_mutex);
}
+/*
+ * Buffer size for a "/proc/<pid>" path:
+ *
+ * /proc/ = 6
+ * +
+ * <pid-as-str> = INTTYPE_TO_STRLEN(uint64_t) (sized for uint64_t, used as an
+ * upper bound for pid_t)
+ * +
+ * \0 = 1
+ */
+#define LXCFS_PROC_PID_LEN \
+ (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
+
/* Must be called under store_lock */
-static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
+/*
+ * Report whether the cached init pid in @entry is still usable.
+ *
+ * If the entry holds a pidfd (init_pidfd >= 0), it is probed with
+ * pidfd_send_signal() and signal number 0; any failure marks the entry
+ * invalid. Without a pidfd, /proc/<initpid> is stat'ed and its ctime is
+ * compared with the ctime recorded in the entry.
+ *
+ * Returns true if the entry is still considered valid, false otherwise.
+ */
+static bool initpid_still_valid(struct pidns_init_store *entry)
{
- struct stat initsb;
- char fnam[100];
+ bool valid = true;
- snprintf(fnam, 100, "/proc/%d", e->initpid);
- if (stat(fnam, &initsb) < 0)
- return false;
+ if (entry->init_pidfd >= 0) {
+ if (pidfd_send_signal(entry->init_pidfd, 0, NULL, 0))
+ valid = false;
+ } else {
+ struct stat st;
+ char path[LXCFS_PROC_PID_LEN];
- lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
- initsb.st_ctime, e->initpid);
+ snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
- if (e->ctime != initsb.st_ctime)
- return false;
- return true;
+ /* A missing /proc entry or a changed ctime both invalidate the entry. */
+ if (stat(path, &st) || entry->ctime != st.st_ctime)
+ valid = false;
+ }
+
+ return valid;
}
/* Must be called under store_lock */
-static void remove_initpid(struct pidns_init_store *e)
+/*
+ * Unlink @entry from the hash bucket selected by HASH(entry->ino), close its
+ * pidfd and free it. If @entry is not found in that bucket nothing is
+ * unlinked, closed or freed.
+ */
+static void remove_initpid(struct pidns_init_store *entry)
{
- struct pidns_init_store *tmp;
- int h;
+ struct pidns_init_store *it;
+ int ino_hash;
- lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
+ lxcfs_debug("Removing cached entry for pid %d from init pid cache",
+ entry->initpid);
- h = HASH(e->ino);
- if (pidns_hash_table[h] == e) {
- pidns_hash_table[h] = e->next;
- free_disarm(e);
+ ino_hash = HASH(entry->ino);
+ /* Fast path: @entry is the head of its bucket. */
+ if (pidns_hash_table[ino_hash] == entry) {
+ pidns_hash_table[ino_hash] = entry->next;
+ close_prot_errno_disarm(entry->init_pidfd);
+ free_disarm(entry);
return;
}
- tmp = pidns_hash_table[h];
- while (tmp) {
- if (tmp->next == e) {
- tmp->next = e->next;
- free_disarm(e);
+ /* Otherwise walk the chain looking for the predecessor of @entry. */
+ it = pidns_hash_table[ino_hash];
+ while (it) {
+ if (it->next == entry) {
+ it->next = entry->next;
+ close_prot_errno_disarm(entry->init_pidfd);
+ free_disarm(entry);
return;
}
- tmp = tmp->next;
+ it = it->next;
}
}
+/*
+ * Drop cache entries that have not been checked recently.
+ *
+ * The first call only records the current time. Later calls do nothing until
+ * at least PURGE_SECS have passed since the last prune; then every entry
+ * whose lastcheck is older than now - 2 * PURGE_SECS is unlinked, its pidfd
+ * closed and its memory freed.
+ */
static void prune_initpid_store(void)
{
static long int last_prune = 0;
- struct pidns_init_store *e, *prev, *delme;
long int now, threshold;
- int i;
if (!last_prune) {
last_prune = time(NULL);
return;
}
+
now = time(NULL);
if (now < last_prune + PURGE_SECS)
return;
- lxcfs_debug("%s\n", "Pruning.");
+ lxcfs_debug("Pruning init pid cache");
last_prune = now;
threshold = now - 2 * PURGE_SECS;
- for (i = 0; i < PIDNS_HASH_SIZE; i++) {
- for (prev = NULL, e = pidns_hash_table[i]; e; ) {
- if (e->lastcheck < threshold) {
+ for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
+ for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
+ if (entry->lastcheck < threshold) {
+ struct pidns_init_store *cur = entry;
- lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
+ lxcfs_debug("Removing cache entry for pid %d from init pid cache", cur->initpid);
- delme = e;
if (prev)
- prev->next = e->next;
+ prev->next = entry->next;
else
- pidns_hash_table[i] = e->next;
- e = e->next;
- free_disarm(delme);
+ pidns_hash_table[i] = entry->next;
+ /* Advance before releasing cur so the walk never touches freed memory. */
+ entry = entry->next;
+ close_prot_errno_disarm(cur->init_pidfd);
+ free_disarm(cur);
} else {
- prev = e;
- e = e->next;
+ prev = entry;
+ entry = entry->next;
}
}
}
/* Must be called under store_lock */
+/*
+ * Cache @pid as the init process of the pid namespace identified by
+ * @sb->st_ino.
+ *
+ * When the use_pidfd option is set and the kernel supports pidfds, a pidfd
+ * for @pid is opened and stored in the entry; otherwise init_pidfd stays
+ * negative and the recorded ctime of /proc/<pid> is what
+ * initpid_still_valid() checks later.
+ *
+ * Returns nothing. If pidfd_open(), stat() or malloc() fails, no entry is
+ * added.
+ */
static void save_initpid(struct stat *sb, pid_t pid)
{
- struct pidns_init_store *e;
- char fpath[100];
- struct stat procsb;
- int h;
+ __do_free struct pidns_init_store *entry = NULL;
+ __do_close_prot_errno int pidfd = -EBADF;
+ char path[LXCFS_PROC_PID_LEN];
+ struct lxcfs_opts *opts = fuse_get_context()->private_data;
+ struct stat st;
+ int ino_hash;
+
+ if (opts->use_pidfd && can_use_pidfd) {
+ pidfd = pidfd_open(pid, 0);
+ if (pidfd < 0)
+ return;
+ }
- lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
+ snprintf(path, sizeof(path), "/proc/%d", pid);
+ if (stat(path, &st))
+ return;
- snprintf(fpath, 100, "/proc/%d", pid);
- if (stat(fpath, &procsb) < 0)
+ entry = malloc(sizeof(*entry));
+ /* Bail out only when the allocation failed. */
+ if (!entry)
return;
- do {
- e = malloc(sizeof(*e));
- } while (!e);
- e->ino = sb->st_ino;
- e->initpid = pid;
- e->ctime = procsb.st_ctime;
- h = HASH(e->ino);
- e->next = pidns_hash_table[h];
- e->lastcheck = time(NULL);
- pidns_hash_table[h] = e;
+
+ /* Hash the caller's inode: *entry is not initialized at this point. */
+ ino_hash = HASH(sb->st_ino);
+ *entry = (struct pidns_init_store){
+ .ino = sb->st_ino,
+ .initpid = pid,
+ .ctime = st.st_ctime,
+ .next = pidns_hash_table[ino_hash],
+ .lastcheck = time(NULL),
+ .init_pidfd = move_fd(pidfd),
+ };
+ /* Ownership moves to the hash table; move_ptr() disarms __do_free. */
+ pidns_hash_table[ino_hash] = move_ptr(entry);
+
+ lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
}
/*
*/
static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
{
- int h = HASH(sb->st_ino);
- struct pidns_init_store *e = pidns_hash_table[h];
-
- while (e) {
- if (e->ino == sb->st_ino) {
- if (initpid_still_valid(e, sb)) {
- e->lastcheck = time(NULL);
- return e;
+ struct pidns_init_store *entry = pidns_hash_table[HASH(sb->st_ino)];
+
+ while (entry) {
+ if (entry->ino == sb->st_ino) {
+ if (initpid_still_valid(entry)) {
+ /* Refresh the timestamp that prune_initpid_store() compares against. */
+ entry->lastcheck = time(NULL);
+ return entry;
}
- remove_initpid(e);
+
+ /* Matching inode but stale init pid: drop the entry and report a miss. */
+ remove_initpid(entry);
return NULL;
}
- e = e->next;
+ entry = entry->next;
}
return NULL;
}
-struct cgfs_files {
- char *name;
- uint32_t uid, gid;
- uint32_t mode;
-};
-
-static void print_subsystems(void)
+/*
+ * Entry point handed to lxcfs_clone(). @arg carries the socket fd encoded
+ * with INT_TO_PTR(). Sends the credentials {uid 0, gid 0, pid 1} together
+ * with the byte '1' over that socket.
+ *
+ * Returns 0 if send_creds() reports SEND_CREDS_OK, 1 otherwise.
+ */
+static int send_creds_clone_wrapper(void *arg)
{
- int i = 0;
-
- fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd);
- fprintf(stderr, "hierarchies:\n");
- for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
- __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
- fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
- }
+ int sock = PTR_TO_INT(arg);
+ char v = '1'; /* we are the child */
+ struct ucred cred = {
+ .uid = 0,
+ .gid = 0,
+ .pid = 1,
+ };
+
+ return send_creds(sock, &cred, v, true) != SEND_CREDS_OK;
}
-bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
+/*
+ * Let's use the "standard stack limit" (i.e. glibc thread size default) for
+ * stack sizes: 8MB.
+ */
+#define __LXCFS_STACK_SIZE (8 * 1024 * 1024)
+/*
+ * Run @fn(@arg) in a new task created with clone() (or __clone2() on ia64).
+ * SIGCHLD is always OR'ed into @flags. The child stack is a freshly
+ * malloc'ed buffer of __LXCFS_STACK_SIZE bytes; clone() is given the end of
+ * that buffer, __clone2() its start plus the size.
+ *
+ * Returns the value returned by clone()/__clone2(), or the result of
+ * ret_errno(ENOMEM) if the stack cannot be allocated.
+ *
+ * NOTE(review): the stack buffer is never freed in this function; presumably
+ * acceptable because the visible caller _exit()s shortly afterwards -
+ * confirm before adding long-lived callers.
+ */
+static pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags)
{
- int ret, cfd;
- size_t len;
- char *fnam;
+ pid_t ret;
+ void *stack;
- cfd = get_cgroup_fd(controller);
- if (cfd < 0)
- return false;
+ stack = malloc(__LXCFS_STACK_SIZE);
+ if (!stack)
+ return ret_errno(ENOMEM);
- /* Make sure we pass a relative path to *at() family of functions.
- * . + /cgroup + / + file + \0
- */
- len = strlen(cgroup) + strlen(file) + 3;
- fnam = alloca(len);
- ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
- if (ret < 0 || (size_t)ret >= len)
- return false;
-
- return (faccessat(cfd, fnam, F_OK, 0) == 0);
+#ifdef __ia64__
+ ret = __clone2(fn, stack, __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
+#else
+ ret = clone(fn, stack + __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
+#endif
+ return ret;
}
-#define SEND_CREDS_OK 0
-#define SEND_CREDS_NOTSK 1
-#define SEND_CREDS_FAIL 2
-static int wait_for_pid(pid_t pid);
-static int send_creds_clone_wrapper(void *arg);
+/*
+ * Buffer size for a "/proc/<pid>/ns/pid" path: both literals, the decimal
+ * pid (sized via INTTYPE_TO_STRLEN(uint64_t)) and the terminating \0.
+ */
+#define LXCFS_PROC_PID_NS_LEN \
+ (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
+ STRLITERALLEN("/ns/pid") + 1)
/*
* clone a task which switches to @task's namespace and writes '1'.
*/
static void write_task_init_pid_exit(int sock, pid_t target)
{
- char fnam[100];
+ __do_close_prot_errno int fd = -EBADF;
+ char path[LXCFS_PROC_PID_NS_LEN];
pid_t pid;
- int fd, ret;
- size_t stack_size = sysconf(_SC_PAGESIZE);
- void *stack = alloca(stack_size);
-
- ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
- if (ret < 0 || ret >= sizeof(fnam))
- _exit(1);
-
- fd = open(fnam, O_RDONLY);
- if (fd < 0) {
- perror("write_task_init_pid_exit open of ns/pid");
- _exit(1);
- }
- if (setns(fd, 0)) {
- perror("write_task_init_pid_exit setns 1");
- close(fd);
- _exit(1);
- }
- pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
+
+ /* Join the pid namespace of @target via its /proc/<pid>/ns/pid file. */
+ snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
+ fd = open(path, O_RDONLY | O_CLOEXEC);
+ if (fd < 0)
+ log_exit("write_task_init_pid_exit open of ns/pid");
+
+ if (setns(fd, 0))
+ log_exit("Failed to setns to pid namespace of process %d", target);
+
+ /* The cloned task runs send_creds_clone_wrapper() with @sock as its argument. */
+ pid = lxcfs_clone(send_creds_clone_wrapper, INT_TO_PTR(sock), 0);
if (pid < 0)
- _exit(1);
+ _exit(EXIT_FAILURE);
+
if (pid != 0) {
+ /*
+ * NOTE(review): if wait_for_pid() returns 0 on success, this test is
+ * inverted and a clean child exit is reported as failure - confirm
+ * its contract.
+ */
if (!wait_for_pid(pid))
- _exit(1);
- _exit(0);
- }
-}
+ _exit(EXIT_FAILURE);
-static int send_creds_clone_wrapper(void *arg) {
- struct ucred cred;
- char v;
- int sock = *(int *)arg;
-
- /* we are the child */
- cred.uid = 0;
- cred.gid = 0;
- cred.pid = 1;
- v = '1';
- if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
- return 1;
- return 0;
+ _exit(EXIT_SUCCESS);
+ }
}
+/*
+ * Determine the init pid for @task by forking a helper that runs
+ * write_task_init_pid_exit() on one end of a socketpair and reading the
+ * credentials it sends from the other end.
+ *
+ * Returns the pid from the received credentials, or -1 if the socketpair,
+ * the fork or the receive fails. Both socket ends are closed and the helper
+ * is reaped before returning.
+ */
static pid_t get_init_pid_for_task(pid_t task)
{
- int sock[2];
- pid_t pid;
- pid_t ret = -1;
char v = '0';
+ pid_t pid_ret = -1;
+ pid_t pid;
+ int sock[2];
struct ucred cred;
- if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
- perror("socketpair");
+ if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
return -1;
- }
pid = fork();
if (pid < 0)
goto out;
- if (!pid) {
+
+ if (pid == 0) {
close(sock[1]);
write_task_init_pid_exit(sock[0], task);
- _exit(0);
+ _exit(EXIT_SUCCESS);
}
if (!recv_creds(sock[1], &cred, &v))
goto out;
- ret = cred.pid;
+
+ pid_ret = cred.pid;
out:
close(sock[0]);
close(sock[1]);
if (pid > 0)
wait_for_pid(pid);
- return ret;
+
+ return pid_ret;
}
-pid_t lookup_initpid_in_store(pid_t qpid)
+/*
+ * Return the init pid for the pid namespace of @pid, using the cache when
+ * possible.
+ *
+ * The namespace is identified by stat'ing /proc/<pid>/ns/pid. On a cache hit
+ * the cached init pid is returned; on a miss get_init_pid_for_task() is
+ * asked and a positive answer is stored with save_initpid(). The whole
+ * lookup, including the final prune, runs between store_lock() and
+ * store_unlock().
+ *
+ * Returns 0 if the namespace file cannot be stat'ed; otherwise the cached
+ * pid or whatever get_init_pid_for_task() returned (-1 on its failure).
+ */
+pid_t lookup_initpid_in_store(pid_t pid)
{
pid_t answer = 0;
- struct stat sb;
- struct pidns_init_store *e;
- char fnam[100];
+ char path[LXCFS_PROC_PID_NS_LEN];
+ struct stat st;
+ struct pidns_init_store *entry;
+
+ snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
- snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
store_lock();
- if (stat(fnam, &sb) < 0)
+ if (stat(path, &st))
goto out;
- e = lookup_verify_initpid(&sb);
- if (e) {
- answer = e->initpid;
+
+ entry = lookup_verify_initpid(&st);
+ if (entry) {
+ answer = entry->initpid;
goto out;
}
- answer = get_init_pid_for_task(qpid);
+
+ answer = get_init_pid_for_task(pid);
if (answer > 0)
- save_initpid(&sb, answer);
+ save_initpid(&st, answer);
out:
- /* we prune at end in case we are returning
- * the value we were about to return */
+ /*
+ * Prune only at the end, so that pruning cannot drop the entry whose
+ * value we are about to return.
+ */
prune_initpid_store();
- store_unlock();
- return answer;
-}
-
-static int wait_for_pid(pid_t pid)
-{
- int status, ret;
-
- if (pid <= 0)
- return -1;
-
-again:
- ret = waitpid(pid, &status, 0);
- if (ret == -1) {
- if (errno == EINTR)
- goto again;
- return -1;
- }
- if (ret != pid)
- goto again;
- if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
- return -1;
- return 0;
-}
-
-#define INITSCOPE "/init.scope"
-void prune_init_slice(char *cg)
-{
- char *point;
- size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
- if (cg_len < initscope_len)
- return;
+ store_unlock();
- point = cg + cg_len - initscope_len;
- if (strcmp(point, INITSCOPE) == 0) {
- if (point == cg)
- *(point+1) = '\0';
- else
- *point = '\0';
- }
+ return answer;
}
-struct pid_ns_clone_args {
- int *cpipe;
- int sock;
- pid_t tpid;
- int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
-};
-
/*
* Functions needed to setup cgroups in the __constructor__.
*/
*/
static bool is_on_ramfs(void)
{
- FILE *f;
- char *p, *p2;
- char *line = NULL;
+ __do_free char *line = NULL;
+ __do_fclose FILE *f = NULL;
size_t len = 0;
- int i;
- f = fopen("/proc/self/mountinfo", "r");
+ f = fopen("/proc/self/mountinfo", "re");
if (!f)
return false;
while (getline(&line, &len, f) != -1) {
+ int i;
+ char *p, *p2;
+
+ /* Advance to the fourth space; the field after it is compared with "/" below. */
for (p = line, i = 0; p && i < 4; i++)
p = strchr(p + 1, ' ');
if (!p)
continue;
+
+ /* Terminate that field in place so it can be compared as a string. */
p2 = strchr(p + 1, ' ');
if (!p2)
continue;
*p2 = '\0';
if (strcmp(p + 1, "/") == 0) {
- // this is '/'. is it the ramfs?
+ /* This is '/'. Is it the ramfs? */
p = strchr(p2 + 1, '-');
- if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
- free(line);
- fclose(f);
+ if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
return true;
- }
}
}
- free(line);
- fclose(f);
+
return false;
}
+/*
+ * Make ROOTDIR the new root via pivot_root() and detach the old root.
+ *
+ * Sequence: open fds for "/" and ROOTDIR, fchdir() into ROOTDIR,
+ * pivot_root(".", "."), fchdir() back to the old root, lazily unmount it
+ * with umount2(MNT_DETACH), then fchdir() into the new root again.
+ *
+ * Returns 0 on success. Every failure path returns through
+ * log_error_errno(-1, errno, ...), presumably yielding -1 - confirm against
+ * that macro.
+ */
static int pivot_enter()
{
- int ret = -1, oldroot = -1, newroot = -1;
+ __do_close_prot_errno int oldroot = -EBADF, newroot = -EBADF;
oldroot = open("/", O_DIRECTORY | O_RDONLY);
- if (oldroot < 0) {
- lxcfs_error("%s\n", "Failed to open old root for fchdir.");
- return ret;
- }
+ if (oldroot < 0)
+ return log_error_errno(-1, errno,
+ "Failed to open old root for fchdir");
newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
- if (newroot < 0) {
- lxcfs_error("%s\n", "Failed to open new root for fchdir.");
- goto err;
- }
+ if (newroot < 0)
+ return log_error_errno(-1, errno,
+ "Failed to open new root for fchdir");
/* change into new root fs */
- if (fchdir(newroot) < 0) {
- lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
- goto err;
- }
+ if (fchdir(newroot) < 0)
+ return log_error_errno(-1,
+ errno, "Failed to change directory to new rootfs: %s",
+ ROOTDIR);
/* pivot_root into our new root fs */
- if (pivot_root(".", ".") < 0) {
- lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
- goto err;
- }
+ if (pivot_root(".", ".") < 0)
+ return log_error_errno(-1, errno,
+ "pivot_root() syscall failed: %s",
+ strerror(errno));
/*
* At this point the old-root is mounted on top of our new-root.
* To unmounted it we must not be chdir'd into it, so escape back
* to the old-root.
*/
- if (fchdir(oldroot) < 0) {
- lxcfs_error("%s\n", "Failed to enter old root.");
- goto err;
- }
-
- if (umount2(".", MNT_DETACH) < 0) {
- lxcfs_error("%s\n", "Failed to detach old root.");
- goto err;
- }
+ if (fchdir(oldroot) < 0)
+ return log_error_errno(-1, errno, "Failed to enter old root");
- if (fchdir(newroot) < 0) {
- lxcfs_error("%s\n", "Failed to re-enter new root.");
- goto err;
- }
+ if (umount2(".", MNT_DETACH) < 0)
+ return log_error_errno(-1, errno, "Failed to detach old root");
- ret = 0;
+ if (fchdir(newroot) < 0)
+ return log_error_errno(-1, errno, "Failed to re-enter new root");
-err:
- if (oldroot > 0)
- close(oldroot);
- if (newroot > 0)
- close(newroot);
-
- return ret;
+ return 0;
}
static int chroot_enter()
if (!cgfs_prepare_mounts())
return false;
- if (!cgfs_mount_hierarchies()) {
- lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
- return false;
- }
+ if (!cgfs_mount_hierarchies())
+ return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");
if (!permute_root())
return false;
static void __attribute__((constructor)) lxcfs_init(void)
{
- __do_close_prot_errno int init_ns = -EBADF;
+ __do_close_prot_errno int init_ns = -EBADF, pidfd = -EBADF;
+ int i = 0;
+ pid_t pid;
char *cret;
char cwd[MAXPATHLEN];
log_exit("Failed to initialize cgroup support");
/* Preserve initial namespace. */
- init_ns = preserve_ns(getpid(), "mnt");
+ pid = getpid();
+ init_ns = preserve_ns(pid, "mnt");
if (init_ns < 0)
log_exit("Failed to preserve initial mount namespace");
cret = getcwd(cwd, MAXPATHLEN);
+ if (!cret)
log_exit("%s - Could not retrieve current working directory", strerror(errno));
/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
if (!init_cpuview())
log_exit("Failed to init CPU view");
- print_subsystems();
+ fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd);
+ fprintf(stderr, "hierarchies:\n");
+
+ for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
+ __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
+ fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
+ }
+
+ pidfd = pidfd_open(pid, 0);
+ if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
+ can_use_pidfd = true;
+ fprintf(stderr, "Kernel supports pidfds\n");
+ }
+
+ fprintf(stderr, "api_extensions:\n");
+ for (i = 0; i < nr_api_extensions; i++)
+ fprintf(stderr, "- %s\n", api_extensions[i]);
}
static void __attribute__((destructor)) lxcfs_exit(void)