/* SPDX-License-Identifier: LGPL-2.1+ */
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
-#ifndef FUSE_USE_VERSION
-#define FUSE_USE_VERSION 26
-#endif
-
-#define _FILE_OFFSET_BITS 64
+#include "config.h"
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
-#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <linux/magic.h>
#include <unistd.h>
#include <wait.h>
-#include "api_extensions.h"
#include "bindings.h"
+
+#include "api_extensions.h"
#include "cgroup_fuse.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
-#include "config.h"
#include "memory_utils.h"
#include "proc_cpuview.h"
#include "syscall_numbers.h"
#include "utils.h"
static bool can_use_pidfd;
+static bool can_use_swap;
+static bool can_use_sys_cpu;
+static bool has_versioned_opts;
static volatile sig_atomic_t reload_successful;
return reload_successful != 0;
}
+/*
+ * Report the cached swap-accounting capability, i.e. the value this file
+ * stores from cgroup_ops->can_use_swap() during library setup.
+ */
+bool liblxcfs_can_use_swap(void)
+{
+	return can_use_swap;
+}
+
+/*
+ * Report the can_use_sys_cpu flag. It starts out false and is switched on by
+ * lxcfs_fuse_init().
+ */
+bool liblxcfs_can_use_sys_cpu(void)
+{
+	return can_use_sys_cpu;
+}
+
+/*
+ * Report the has_versioned_opts flag. It starts out false and is switched on
+ * by lxcfs_fuse_init().
+ */
+bool liblxcfs_has_versioned_opts(void)
+{
+	return has_versioned_opts;
+}
+
/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char *new_root, const char *put_old)
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
-static void lock_mutex(pthread_mutex_t *l)
+static void mutex_lock(pthread_mutex_t *l)
{
int ret;
struct cgroup_ops *cgroup_ops;
-static void unlock_mutex(pthread_mutex_t *l)
+static void mutex_unlock(pthread_mutex_t *l)
{
int ret;
log_exit("%s - returned %d\n", strerror(ret), ret);
}
-static inline void unlock_mutex_function(pthread_mutex_t **mutex)
+/* Acquire pidns_store_mutex, the lock guarding the init pid store. */
+static inline void store_lock(void)
{
- if (*mutex)
- unlock_mutex(*mutex);
+ mutex_lock(&pidns_store_mutex);
}
-#define __do_unlock call_cleaner(unlock_mutex)
-static pthread_mutex_t* __attribute__((warn_unused_result)) store_lock(void)
+/* Release pidns_store_mutex, the lock guarding the init pid store. */
+static inline void store_unlock(void)
{
- lock_mutex(&pidns_store_mutex);
- return &pidns_store_mutex;
+ mutex_unlock(&pidns_store_mutex);
}
/* /proc/ = 6
}
now = time(NULL);
- if (now < last_prune + PURGE_SECS)
+ if (now < (last_prune + PURGE_SECS))
return;
lxcfs_debug("Pruning init pid cache");
}
}
+/*
+ * Empty the init pid cache: walk every bucket of pidns_hash_table, unlink each
+ * entry, close its init pidfd and free it. Takes and releases the store lock
+ * itself, so it must not be called with the lock already held.
+ */
+static void clear_initpid_store(void)
+{
+	store_lock();
+	for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
+		struct pidns_init_store *entry = pidns_hash_table[i];
+
+		while (entry) {
+			struct pidns_init_store *next = entry->next;
+
+			lxcfs_debug("Removed cache entry for pid %d from init pid cache", entry->initpid);
+
+			/* Unlink first so the bucket never points at freed memory. */
+			pidns_hash_table[i] = next;
+			close_prot_errno_disarm(entry->init_pidfd);
+			free_disarm(entry);
+			entry = next;
+		}
+	}
+	store_unlock();
+}
+
+
/* Must be called under store_lock */
static void save_initpid(ino_t pidns_inode, pid_t pid)
{
__do_free struct pidns_init_store *entry = NULL;
__do_close int pidfd = -EBADF;
+ const struct lxcfs_opts *opts = fuse_get_context()->private_data;
char path[LXCFS_PROC_PID_LEN];
- struct lxcfs_opts *opts = fuse_get_context()->private_data;
struct stat st;
int ino_hash;
if (!entry)
return;
- ino_hash = HASH(entry->ino);
+ ino_hash = HASH(pidns_inode);
*entry = (struct pidns_init_store){
.ino = pidns_inode,
.initpid = pid,
* otherwise.
* Must be called under store_lock
*/
-static struct pidns_init_store *lookup_verify_initpid(ino_t pidns_inode)
+static pid_t lookup_verify_initpid(ino_t pidns_inode)
{
struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
if (entry->ino == pidns_inode) {
if (initpid_still_valid(entry)) {
entry->lastcheck = time(NULL);
- return entry;
+ return entry->initpid;
}
remove_initpid(entry);
- return NULL;
+ return ret_errno(ESRCH);
}
entry = entry->next;
}
- return NULL;
+ return ret_errno(ESRCH);
}
-static int send_creds_clone_wrapper(void *arg)
+/*
+ * Send a credential message over sock_fd carrying uid 0 / pid 1 and the marker
+ * byte '1'. Returns true only if send_creds() reports SEND_CREDS_OK.
+ */
+static bool send_creds_ok(int sock_fd)
{
- int sock = PTR_TO_INT(arg);
char v = '1'; /* we are the child */
struct ucred cred = {
.uid = 0,
.pid = 1,
};
- return send_creds(sock, &cred, v, true) != SEND_CREDS_OK;
+ return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK;
}
-/*
- * Let's use the "standard stack limit" (i.e. glibc thread size default) for
- * stack sizes: 8MB.
- */
-#define __LXCFS_STACK_SIZE (8 * 1024 * 1024)
-pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags)
+/*
+ * Issue the clone system call directly, with no new stack and with SIGCHLD
+ * or-ed into @flags, so the call behaves like fork(): it returns in both the
+ * parent and the child.
+ *
+ * @flags: clone flags; CLONE_VM, CLONE_PARENT_SETTID, CLONE_CHILD_SETTID,
+ *         CLONE_CHILD_CLEARTID and CLONE_SETTLS are rejected.
+ * @pidfd: forwarded unchanged as the syscall argument that follows the stack
+ *         argument(s); may be NULL. Presumably filled in by the kernel when
+ *         CLONE_PIDFD is requested - TODO confirm.
+ *
+ * Returns -EINVAL (with errno set to EINVAL) for a rejected flag. Otherwise
+ * returns whatever the raw syscall returns; on sparc64 that is 0 in the child,
+ * the child's pid in the parent, or -1 with errno set on failure.
+ * Note that errno is set to EINVAL before the flag check and is not reset on
+ * the success path.
+ */
+__returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
{
- pid_t ret;
- void *stack;
+ /*
+ * We are not interested in these flags at all, so we don't jump through
+ * any hoops of retrieving them and passing them to the kernel.
+ */
+ errno = EINVAL;
+ if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
+ CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
+ return -EINVAL;
+
+#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
+ /* On s390/s390x and cris the order of the first and second arguments
+ * of the system call is reversed.
+ */
+ return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
+#elif defined(__sparc__) && defined(__arch64__)
+ {
+ /*
+ * sparc64 always returns the other process id in %o0, and a
+ * boolean flag whether this is the child or the parent in %o1.
+ * Inline assembly is needed to get the flag returned in %o1.
+ */
+ register long g1 asm("g1") = __NR_clone;
+ register long o0 asm("o0") = flags | SIGCHLD;
+ register long o1 asm("o1") = 0; /* is parent/child indicator */
+ register long o2 asm("o2") = (unsigned long)pidfd;
+ long is_error, retval, in_child;
+ pid_t child_pid;
+
+ asm volatile(
+ /*
+ * NOTE(review): the enclosing #elif already requires __arch64__,
+ * so the 32-bit trap branch below is never compiled in.
+ */
+#if defined(__arch64__)
+ "t 0x6d\n\t" /* 64-bit trap */
+#else
+ "t 0x10\n\t" /* 32-bit trap */
+#endif
+ /*
+ * catch errors: On sparc, the carry bit (csr) in the
+ * processor status register (psr) is used instead of a
+ * full register.
+ */
+ "addx %%g0, 0, %%g1"
+ : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
+ : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */
+ : "%cc"); /* clobbers */
+
+ is_error = g1;
+ retval = o0;
+ in_child = o1;
+
+ if (is_error) {
+ errno = retval;
+ return -1;
+ }
- stack = malloc(__LXCFS_STACK_SIZE);
- if (!stack)
- return ret_errno(ENOMEM);
+ if (in_child)
+ return 0;
-#ifdef __ia64__
- ret = __clone2(fn, stack, __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
+ child_pid = retval;
+ return child_pid;
+ }
+#elif defined(__ia64__)
+ /* On ia64 the stack and stack size are passed as separate arguments. */
+ return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd);
#else
- ret = clone(fn, stack + __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
+ return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
#endif
- return ret;
}
#define LXCFS_PROC_PID_NS_LEN \
if (setns(fd, 0))
log_exit("Failed to setns to pid namespace of process %d", target);
- pid = lxcfs_clone(send_creds_clone_wrapper, INT_TO_PTR(sock), 0);
+ pid = lxcfs_raw_clone(0, NULL);
if (pid < 0)
_exit(EXIT_FAILURE);
- if (pid != 0) {
- if (!wait_for_pid(pid))
+ if (pid == 0) {
+ if (!send_creds_ok(sock))
_exit(EXIT_FAILURE);
_exit(EXIT_SUCCESS);
}
+
+ if (!wait_for_pid(pid))
+ _exit(EXIT_FAILURE);
+
+ _exit(EXIT_SUCCESS);
}
-static pid_t get_init_pid_for_task(pid_t task)
+static pid_t scm_init_pid(pid_t task)
{
char v = '0';
pid_t pid_ret = -1;
+/*
+ * Resolve the init pid of the pid namespace that @pid lives in.
+ *
+ * The inode of /proc/<pid>/ns/pid is used as the cache key. On a cache miss
+ * the store lock is dropped, scm_init_pid() is asked for the answer, and a
+ * positive result is cached after the lock has been re-taken.
+ *
+ * Returns the cached or freshly determined init pid, ret_errno(ESRCH) if the
+ * namespace file cannot be stat()ed, or the non-positive value handed back by
+ * scm_init_pid() when the lookup fails.
+ * NOTE(review): the previous version returned 0 when stat() failed; callers
+ * that compare the result against 0 should be re-checked.
+ */
pid_t lookup_initpid_in_store(pid_t pid)
{
- __do_unlock pthread_mutex_t *store_mutex = NULL;
- pid_t answer = 0;
+ pid_t hashed_pid = 0;
char path[LXCFS_PROC_PID_NS_LEN];
struct stat st;
- struct pidns_init_store *entry;
snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
-
if (stat(path, &st))
- goto out;
+ return ret_errno(ESRCH);
- store_mutex = store_lock();
+ store_lock();
- entry = lookup_verify_initpid(st.st_ino);
- if (entry) {
- answer = entry->initpid;
- goto out;
- }
+ hashed_pid = lookup_verify_initpid(st.st_ino);
+ if (hashed_pid < 0) {
+ /* release the mutex as the following call is expensive */
+ store_unlock();
- /* release the mutex as the following call is expensive */
- unlock_mutex(move_ptr(store_mutex));
- answer = get_init_pid_for_task(pid);
- store_mutex = store_lock();
+ hashed_pid = scm_init_pid(pid);
- if (answer > 0)
- save_initpid(st.st_ino, answer);
+ store_lock();
+
+ /* Only successful lookups are cached. */
+ if (hashed_pid > 0)
+ save_initpid(st.st_ino, hashed_pid);
+ }
-out:
/*
- * Prune at the end in case we're returning the value we were about to
- * return.
+ * Prune at the end in case we're pruning the value
+ * we were about to return.
*/
prune_initpid_store();
+ store_unlock();
- return answer;
+ return hashed_pid;
}
/*
return false;
}
-static int pivot_enter()
+static int pivot_enter(void)
{
__do_close int oldroot = -EBADF, newroot = -EBADF;
return 0;
}
-static int chroot_enter()
+static int chroot_enter(void)
{
if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
please_compiler:
/*
* The write() syscall is a function whose return value needs to be
- * checked. Otherwise the compiler will warn. This is how we
- * please our master. Another one could be to use
- * syscall(__NR_write, ...) directly but whatever.
+ * checked. Otherwise the compiler will warn. An alternative would be to
+ * use syscall(__NR_write, ...) directly but whatever.
*/
return;
}
lxcfs_info("Kernel supports pidfds");
}
+ can_use_swap = cgroup_ops->can_use_swap(cgroup_ops);
+ if (can_use_swap)
+ lxcfs_info("Kernel supports swap accounting");
+ else
+ lxcfs_info("Kernel does not support swap accounting");
+
lxcfs_info("api_extensions:");
- for (i = 0; i < nr_api_extensions; i++)
- lxcfs_info("- %s", api_extensions[i]);
+ for (size_t nr = 0; nr < nr_api_extensions; nr++)
+ lxcfs_info("- %s", api_extensions[nr]);
root_fd = open("/", O_PATH | O_CLOEXEC);
if (root_fd < 0)
{
lxcfs_info("Running destructor %s", __func__);
+ clear_initpid_store();
free_cpuview();
cgroup_exit(cgroup_ops);
}
+
+/*
+ * FUSE init callback.
+ *
+ * Switches on the can_use_sys_cpu and has_versioned_opts flags and hands back
+ * the private_data pointer of the current FUSE context unchanged. @conn and
+ * @data are not consulted.
+ *
+ * Returns the context's private_data, or NULL if no FUSE context is
+ * available.
+ */
+void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data)
+{
+	struct fuse_context *fc = fuse_get_context();
+
+	can_use_sys_cpu = true;
+	has_versioned_opts = true;
+
+	/* Be defensive: do not dereference a missing context. */
+	return fc ? fc->private_data : NULL;
+}