git.proxmox.com Git - mirror_lxcfs.git/blobdiff - src/bindings.c
Fix build on ia64
[mirror_lxcfs.git] / src / bindings.c
index 0c1973938e5c83a7a838eb550b51340c3b96d639..fe106a65c34ceb28e8a077525a661d0ffc6b1e3b 100644 (file)
@@ -1,19 +1,10 @@
 /* SPDX-License-Identifier: LGPL-2.1+ */
 
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
-#ifndef FUSE_USE_VERSION
-#define FUSE_USE_VERSION 26
-#endif
-
-#define _FILE_OFFSET_BITS 64
+#include "config.h"
 
 #include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
-#include <fuse.h>
 #include <inttypes.h>
 #include <libgen.h>
 #include <linux/magic.h>
 #include <unistd.h>
 #include <wait.h>
 
-#include "api_extensions.h"
 #include "bindings.h"
+
+#include "api_extensions.h"
 #include "cgroup_fuse.h"
 #include "cgroups/cgroup.h"
 #include "cgroups/cgroup_utils.h"
-#include "config.h"
 #include "memory_utils.h"
 #include "proc_cpuview.h"
 #include "syscall_numbers.h"
 #include "utils.h"
 
 static bool can_use_pidfd;
+static bool can_use_swap;
+static bool can_use_sys_cpu;
+static bool has_versioned_opts;
+static bool memory_is_cgroupv2;
 
 static volatile sig_atomic_t reload_successful;
 
@@ -58,6 +53,26 @@ bool liblxcfs_functional(void)
        return reload_successful != 0;
 }
 
+bool liblxcfs_can_use_swap(void)
+{
+       return can_use_swap;
+}
+
+bool liblxcfs_can_use_sys_cpu(void)
+{
+       return can_use_sys_cpu;
+}
+
+bool liblxcfs_has_versioned_opts(void)
+{
+       return has_versioned_opts;
+}
+
+bool liblxcfs_memory_is_cgroupv2(void)
+{
+       return memory_is_cgroupv2;
+}
+
 /* Define pivot_root() if missing from the C library */
 #ifndef HAVE_PIVOT_ROOT
 static int pivot_root(const char *new_root, const char *put_old)
@@ -98,7 +113,7 @@ struct pidns_init_store {
 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-static void lock_mutex(pthread_mutex_t *l)
+static void mutex_lock(pthread_mutex_t *l)
 {
        int ret;
 
@@ -109,7 +124,7 @@ static void lock_mutex(pthread_mutex_t *l)
 
 struct cgroup_ops *cgroup_ops;
 
-static void unlock_mutex(pthread_mutex_t *l)
+static void mutex_unlock(pthread_mutex_t *l)
 {
        int ret;
 
@@ -118,17 +133,14 @@ static void unlock_mutex(pthread_mutex_t *l)
                log_exit("%s - returned %d\n", strerror(ret), ret);
 }
 
-static inline void unlock_mutex_function(pthread_mutex_t **mutex)
+static inline void store_lock(void)
 {
-       if (*mutex)
-               unlock_mutex(*mutex);
+       mutex_lock(&pidns_store_mutex);
 }
-#define __do_unlock call_cleaner(unlock_mutex)
 
-static pthread_mutex_t* __attribute__((warn_unused_result)) store_lock(void)
+static inline void store_unlock(void)
 {
-       lock_mutex(&pidns_store_mutex);
-       return &pidns_store_mutex;
+       mutex_unlock(&pidns_store_mutex);
 }
 
 /* /proc/       =    6
@@ -224,7 +236,7 @@ static void prune_initpid_store(void)
        }
 
        now = time(NULL);
-       if (now < last_prune + PURGE_SECS)
+       if (now < (last_prune + PURGE_SECS))
                return;
 
        lxcfs_debug("Pruning init pid cache");
@@ -254,13 +266,31 @@ static void prune_initpid_store(void)
        }
 }
 
+static void clear_initpid_store(void)
+{
+       store_lock();
+       for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
+               for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) {
+                       struct pidns_init_store *cur = entry;
+
+                       lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
+
+                       pidns_hash_table[i] = entry->next;
+                       entry = entry->next;
+                       close_prot_errno_disarm(cur->init_pidfd);
+                       free_disarm(cur);
+               }
+       }
+       store_unlock();
+}
+
 /* Must be called under store_lock */
 static void save_initpid(ino_t pidns_inode, pid_t pid)
 {
        __do_free struct pidns_init_store *entry = NULL;
        __do_close int pidfd = -EBADF;
+       const struct lxcfs_opts *opts = fuse_get_context()->private_data;
        char path[LXCFS_PROC_PID_LEN];
-       struct lxcfs_opts *opts = fuse_get_context()->private_data;
        struct stat st;
        int ino_hash;
 
@@ -274,11 +304,11 @@ static void save_initpid(ino_t pidns_inode, pid_t pid)
        if (stat(path, &st))
                return;
 
-       entry = malloc(sizeof(*entry));
+       entry = zalloc(sizeof(*entry));
        if (!entry)
                return;
 
-       ino_hash = HASH(entry->ino);
+       ino_hash = HASH(pidns_inode);
        *entry = (struct pidns_init_store){
                .ino            = pidns_inode,
                .initpid        = pid,
@@ -299,7 +329,7 @@ static void save_initpid(ino_t pidns_inode, pid_t pid)
  * otherwise.
  * Must be called under store_lock
  */
-static struct pidns_init_store *lookup_verify_initpid(ino_t pidns_inode)
+static pid_t lookup_verify_initpid(ino_t pidns_inode)
 {
        struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
 
@@ -307,21 +337,20 @@ static struct pidns_init_store *lookup_verify_initpid(ino_t pidns_inode)
                if (entry->ino == pidns_inode) {
                        if (initpid_still_valid(entry)) {
                                entry->lastcheck = time(NULL);
-                               return entry;
+                               return entry->initpid;
                        }
 
                        remove_initpid(entry);
-                       return NULL;
+                       return ret_errno(ESRCH);
                }
                entry = entry->next;
        }
 
-       return NULL;
+       return ret_errno(ESRCH);
 }
 
-static int send_creds_clone_wrapper(void *arg)
+static bool send_creds_ok(int sock_fd)
 {
-       int sock = PTR_TO_INT(arg);
        char v = '1'; /* we are the child */
        struct ucred cred = {
            .uid = 0,
@@ -329,29 +358,76 @@ static int send_creds_clone_wrapper(void *arg)
            .pid = 1,
        };
 
-       return send_creds(sock, &cred, v, true) != SEND_CREDS_OK;
+       return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK;
 }
 
-/*
- * Let's use the "standard stack limit" (i.e. glibc thread size default) for
- * stack sizes: 8MB.
- */
-#define __LXCFS_STACK_SIZE (8 * 1024 * 1024)
-pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags)
+__returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
 {
-       pid_t ret;
-       void *stack;
+       /*
+        * These flags are of no interest to us, so we don't jump through any
+        * hoops to retrieve them and pass them to the kernel.
+        */
+       errno = EINVAL;
+       if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
+                     CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
+               return -EINVAL;
+
+#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
+       /* On s390/s390x and cris the order of the first and second arguments
+        * of the system call is reversed.
+        */
+       return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
+#elif defined(__sparc__) && defined(__arch64__)
+       {
+               /*
+                * sparc64 always returns the other process id in %o0, and a
+                * boolean flag whether this is the child or the parent in %o1.
+                * Inline assembly is needed to get the flag returned in %o1.
+                */
+               register long g1 asm("g1") = __NR_clone;
+               register long o0 asm("o0") = flags | SIGCHLD;
+               register long o1 asm("o1") = 0; /* is parent/child indicator */
+               register long o2 asm("o2") = (unsigned long)pidfd;
+               long is_error, retval, in_child;
+               pid_t child_pid;
+
+               asm volatile(
+#if defined(__arch64__)
+                   "t 0x6d\n\t" /* 64-bit trap */
+#else
+                   "t 0x10\n\t" /* 32-bit trap */
+#endif
+                   /*
+                    * catch errors: On sparc, the carry bit (csr) in the
+                    * processor status register (psr) is used instead of a
+                    * full register.
+                    */
+                   "addx %%g0, 0, %%g1"
+                   : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
+                   : "r"(g1), "r"(o0), "r"(o1), "r"(o2)     /* inputs */
+                   : "%cc");                                /* clobbers */
+
+               is_error = g1;
+               retval = o0;
+               in_child = o1;
+
+               if (is_error) {
+                       errno = retval;
+                       return -1;
+               }
 
-       stack = malloc(__LXCFS_STACK_SIZE);
-       if (!stack)
-               return ret_errno(ENOMEM);
+               if (in_child)
+                       return 0;
 
-#ifdef __ia64__
-       ret = __clone2(fn, stack, __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
+               child_pid = retval;
+               return child_pid;
+       }
+#elif defined(__ia64__)
+       /* On ia64 the stack and stack size are passed as separate arguments. */
+       return syscall(__NR_clone, flags | SIGCHLD, NULL, 0, pidfd);
 #else
-       ret = clone(fn, stack + __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
+       return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
 #endif
-       return ret;
 }
 
 #define LXCFS_PROC_PID_NS_LEN                                    \
@@ -382,19 +458,24 @@ static void write_task_init_pid_exit(int sock, pid_t target)
        if (setns(fd, 0))
                log_exit("Failed to setns to pid namespace of process %d", target);
 
-       pid = lxcfs_clone(send_creds_clone_wrapper, INT_TO_PTR(sock), 0);
+       pid = lxcfs_raw_clone(0, NULL);
        if (pid < 0)
                _exit(EXIT_FAILURE);
 
-       if (pid != 0) {
-               if (!wait_for_pid(pid))
+       if (pid == 0) {
+               if (!send_creds_ok(sock))
                        _exit(EXIT_FAILURE);
 
                _exit(EXIT_SUCCESS);
        }
+
+       if (!wait_for_pid(pid))
+               _exit(EXIT_FAILURE);
+
+       _exit(EXIT_SUCCESS);
 }
 
-static pid_t get_init_pid_for_task(pid_t task)
+static pid_t scm_init_pid(pid_t task)
 {
        char v = '0';
        pid_t pid_ret = -1;
@@ -435,41 +516,37 @@ out:
 
 pid_t lookup_initpid_in_store(pid_t pid)
 {
-       __do_unlock pthread_mutex_t *store_mutex = NULL;
-       pid_t answer = 0;
+       pid_t hashed_pid = 0;
        char path[LXCFS_PROC_PID_NS_LEN];
        struct stat st;
-       struct pidns_init_store *entry;
 
        snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
-
        if (stat(path, &st))
-               goto out;
+               return ret_errno(ESRCH);
 
-       store_mutex = store_lock();
+       store_lock();
 
-       entry = lookup_verify_initpid(st.st_ino);
-       if (entry) {
-               answer = entry->initpid;
-               goto out;
-       }
+       hashed_pid = lookup_verify_initpid(st.st_ino);
+       if (hashed_pid < 0) {
+               /* release the mutex as the following call is expensive */
+               store_unlock();
 
-       /* release the mutex as the following call is expensive */
-       unlock_mutex(move_ptr(store_mutex));
-       answer = get_init_pid_for_task(pid);
-       store_mutex = store_lock();
+               hashed_pid = scm_init_pid(pid);
 
-       if (answer > 0)
-               save_initpid(st.st_ino, answer);
+               store_lock();
+
+               if (hashed_pid > 0)
+                       save_initpid(st.st_ino, hashed_pid);
+       }
 
-out:
        /*
-        * Prune at the end in case we're returning the value we were about to
-        * return.
+        * Prune at the end in case we're pruning the value
+        * we were about to return.
         */
        prune_initpid_store();
+       store_unlock();
 
-       return answer;
+       return hashed_pid;
 }
 
 /*
@@ -534,7 +611,7 @@ static bool is_on_ramfs(void)
        return false;
 }
 
-static int pivot_enter()
+static int pivot_enter(void)
 {
        __do_close int oldroot = -EBADF, newroot = -EBADF;
 
@@ -577,7 +654,7 @@ static int pivot_enter()
        return 0;
 }
 
-static int chroot_enter()
+static int chroot_enter(void)
 {
        if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
                lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
@@ -754,9 +831,8 @@ static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extr
 please_compiler:
        /*
         * The write() syscall is a function whose return value needs to be
-        * checked. Otherwise the compiler will warn. This is how we
-        * please our master. Another one could be to use
-        * syscall(__NR_write, ...) directly but whatever.
+        * checked. Otherwise the compiler will warn. Another option would be
+        * to use syscall(__NR_write, ...) directly but whatever.
         */
        return;
 }
@@ -767,6 +843,7 @@ static void __attribute__((constructor)) lxcfs_init(void)
                                  pidfd = -EBADF;
        int i = 0;
        pid_t pid;
+       struct hierarchy *hierarchy;
 
        lxcfs_info("Running constructor %s to reload liblxcfs", __func__);
 
@@ -818,9 +895,18 @@ static void __attribute__((constructor)) lxcfs_init(void)
                lxcfs_info("Kernel supports pidfds");
        }
 
+       can_use_swap = cgroup_ops->can_use_swap(cgroup_ops);
+       if (can_use_swap)
+               lxcfs_info("Kernel supports swap accounting");
+       else
+               lxcfs_info("Kernel does not support swap accounting");
+
+       hierarchy = cgroup_ops->get_hierarchy(cgroup_ops, "memory");
+       memory_is_cgroupv2 = hierarchy && is_unified_hierarchy(hierarchy);
+
        lxcfs_info("api_extensions:");
-       for (i = 0; i < nr_api_extensions; i++)
-               lxcfs_info("- %s", api_extensions[i]);
+       for (size_t nr = 0; nr < nr_api_extensions; nr++)
+               lxcfs_info("- %s", api_extensions[nr]);
 
        root_fd = open("/", O_PATH | O_CLOEXEC);
        if (root_fd < 0)
@@ -845,6 +931,17 @@ static void __attribute__((destructor)) lxcfs_exit(void)
 {
        lxcfs_info("Running destructor %s", __func__);
 
+       clear_initpid_store();
        free_cpuview();
        cgroup_exit(cgroup_ops);
 }
+
+void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data)
+{
+       struct fuse_context *fc = fuse_get_context();
+#if HAVE_FUSE_RETURNS_DT_TYPE
+       can_use_sys_cpu = true;
+#endif
+       has_versioned_opts = true;
+       return fc ? fc->private_data : NULL;
+}