1 /* SPDX-License-Identifier: LGPL-2.1+ */
10 #include <linux/magic.h>
11 #include <linux/sched.h>
20 #include <sys/epoll.h>
22 #include <sys/mount.h>
23 #include <sys/param.h>
24 #include <sys/socket.h>
25 #include <sys/syscall.h>
26 #include <sys/sysinfo.h>
34 #include "api_extensions.h"
35 #include "cgroup_fuse.h"
36 #include "cgroups/cgroup.h"
37 #include "cgroups/cgroup_utils.h"
38 #include "memory_utils.h"
39 #include "proc_cpuview.h"
40 #include "syscall_numbers.h"
/* Feature flags; each is read back through a liblxcfs_*() accessor or used internally. */
static bool can_use_pidfd;      /* lxcfs_init(): pidfd_open() and pidfd_send_signal() both worked */
static bool can_use_swap;       /* lxcfs_init(): result of cgroup_ops->can_use_swap() */
static bool can_use_sys_cpu;    /* lxcfs_fuse_init(): set when built with HAVE_FUSE_RETURNS_DT_TYPE */
static bool has_versioned_opts; /* lxcfs_fuse_init(): set unconditionally */
static bool memory_is_cgroupv2; /* lxcfs_init(): "memory" hierarchy exists and is unified */

/*
 * Non-zero while virtualization is active; written by the constructor and
 * flipped from the SIGUSR2 handler, hence volatile sig_atomic_t.
 */
static volatile sig_atomic_t reload_successful;
51 bool liblxcfs_functional(void)
53 return reload_successful
!= 0;
56 bool liblxcfs_can_use_swap(void)
61 bool liblxcfs_can_use_sys_cpu(void)
63 return can_use_sys_cpu
;
66 bool liblxcfs_has_versioned_opts(void)
68 return has_versioned_opts
;
71 bool liblxcfs_memory_is_cgroupv2(void)
73 return memory_is_cgroupv2
;
/*
 * pivot_root(): use the C library's symbol when it has one
 * (HAVE_PIVOT_ROOT), otherwise define it on top of the raw system call.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
87 * A table caching which pid is init for a pid namespace.
88 * When looking up which pid is init for $qpid, we first
89 * 1. Stat /proc/$qpid/ns/pid.
90 * 2. Check whether the ino_t is in our store.
91 * a. if not, fork a child in qpid's ns to send us
92 * ucred.pid = 1, and read the initpid. Cache
93 * initpid and creation time for /proc/initpid
94 * in a new store entry.
95 * b. if so, verify that /proc/initpid still matches
96 * what we have saved. If not, clear the store
97 * entry and go back to a. If so, return the
/*
 * One cached "pid namespace -> init pid" mapping. Entries are hashed on the
 * namespace inode number and chained through @next.
 */
struct pidns_init_store {
	ino_t ino;                     /* inode number for /proc/$pid/ns/pid */
	pid_t initpid;                 /* the pid of init in that ns */
	int init_pidfd;                /* pidfd referring to initpid; negative when none is held */
	int64_t ctime;                 /* the time at which /proc/$initpid was created */
	struct pidns_init_store *next; /* next entry in the same hash bucket */
	int64_t lastcheck;             /* time(NULL) when the entry was saved or last found valid */
};
/*
 * The init pid cache is a fixed-size chained hash table; HASH() maps a pid
 * namespace inode number onto a bucket index.
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads; each chain is linked through pidns_init_store::next. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];

/* Guards pidns_hash_table; taken and released via store_lock()/store_unlock(). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Acquire @l. A non-zero result from pthread_mutex_lock() is handed to
 * log_exit() together with its strerror() text.
 */
static void mutex_lock(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret != 0)
		log_exit("%s - returned %d\n", strerror(ret), ret);
}
125 struct cgroup_ops
*cgroup_ops
;
/*
 * Release @l. A non-zero result from pthread_mutex_unlock() is handed to
 * log_exit() together with its strerror() text.
 */
static void mutex_unlock(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret != 0)
		log_exit("%s - returned %d\n", strerror(ret), ret);
}
136 static inline void store_lock(void)
138 mutex_lock(&pidns_store_mutex
);
141 static inline void store_unlock(void)
143 mutex_unlock(&pidns_store_mutex
);
/*
 * Buffer size needed for a "/proc/<pid>" path:
 *
 *   "/proc/"     = STRLITERALLEN("/proc/")
 *   <pid-as-str> = INTTYPE_TO_STRLEN(uint64_t)
 *   '\0'         = 1
 *
 * The pid field is sized via uint64_t rather than pid_t.
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
155 static int initpid_still_valid_pidfd(struct pidns_init_store
*entry
)
159 if (entry
->init_pidfd
< 0)
160 return ret_errno(ENOSYS
);
162 ret
= pidfd_send_signal(entry
->init_pidfd
, 0, NULL
, 0);
165 return ret_errno(ENOSYS
);
173 static int initpid_still_valid_stat(struct pidns_init_store
*entry
)
176 char path
[LXCFS_PROC_PID_LEN
];
178 snprintf(path
, sizeof(path
), "/proc/%d", entry
->initpid
);
179 if (stat(path
, &st
) || entry
->ctime
!= st
.st_ctime
)
185 /* Must be called under store_lock */
186 static bool initpid_still_valid(struct pidns_init_store
*entry
)
190 ret
= initpid_still_valid_pidfd(entry
);
192 ret
= initpid_still_valid_stat(entry
);
197 /* Must be called under store_lock */
198 static void remove_initpid(struct pidns_init_store
*entry
)
200 struct pidns_init_store
*it
;
203 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
206 ino_hash
= HASH(entry
->ino
);
207 if (pidns_hash_table
[ino_hash
] == entry
) {
208 pidns_hash_table
[ino_hash
] = entry
->next
;
209 close_prot_errno_disarm(entry
->init_pidfd
);
214 it
= pidns_hash_table
[ino_hash
];
216 if (it
->next
== entry
) {
217 it
->next
= entry
->next
;
218 close_prot_errno_disarm(entry
->init_pidfd
);
227 /* Must be called under store_lock */
228 static void prune_initpid_store(void)
230 static int64_t last_prune
= 0;
231 int64_t now
, threshold
;
234 last_prune
= time(NULL
);
239 if (now
< (last_prune
+ PURGE_SECS
))
242 lxcfs_debug("Pruning init pid cache");
245 threshold
= now
- 2 * PURGE_SECS
;
247 for (int i
= 0; i
< PIDNS_HASH_SIZE
; i
++) {
248 for (struct pidns_init_store
*entry
= pidns_hash_table
[i
], *prev
= NULL
; entry
;) {
249 if (entry
->lastcheck
< threshold
) {
250 struct pidns_init_store
*cur
= entry
;
252 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur
->initpid
);
255 prev
->next
= entry
->next
;
257 pidns_hash_table
[i
] = entry
->next
;
259 close_prot_errno_disarm(cur
->init_pidfd
);
269 static void clear_initpid_store(void)
272 for (int i
= 0; i
< PIDNS_HASH_SIZE
; i
++) {
273 for (struct pidns_init_store
*entry
= pidns_hash_table
[i
]; entry
;) {
274 struct pidns_init_store
*cur
= entry
;
276 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur
->initpid
);
278 pidns_hash_table
[i
] = entry
->next
;
280 close_prot_errno_disarm(cur
->init_pidfd
);
287 /* Must be called under store_lock */
288 static void save_initpid(ino_t pidns_inode
, pid_t pid
)
290 __do_free
struct pidns_init_store
*entry
= NULL
;
291 __do_close
int pidfd
= -EBADF
;
292 const struct lxcfs_opts
*opts
= fuse_get_context()->private_data
;
293 char path
[LXCFS_PROC_PID_LEN
];
297 if (opts
&& opts
->use_pidfd
&& can_use_pidfd
) {
298 pidfd
= pidfd_open(pid
, 0);
303 snprintf(path
, sizeof(path
), "/proc/%d", pid
);
307 entry
= zalloc(sizeof(*entry
));
311 ino_hash
= HASH(pidns_inode
);
312 *entry
= (struct pidns_init_store
){
315 .ctime
= st
.st_ctime
,
316 .next
= pidns_hash_table
[ino_hash
],
317 .lastcheck
= time(NULL
),
318 .init_pidfd
= move_fd(pidfd
),
320 pidns_hash_table
[ino_hash
] = move_ptr(entry
);
322 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash
, pid
);
326 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
327 * entry for the inode number and creation time. Verify that the init pid
328 * is still valid. If not, remove it. Return the entry if valid, NULL
330 * Must be called under store_lock
332 static pid_t
lookup_verify_initpid(ino_t pidns_inode
)
334 struct pidns_init_store
*entry
= pidns_hash_table
[HASH(pidns_inode
)];
337 if (entry
->ino
== pidns_inode
) {
338 if (initpid_still_valid(entry
)) {
339 entry
->lastcheck
= time(NULL
);
340 return entry
->initpid
;
343 remove_initpid(entry
);
344 return ret_errno(ESRCH
);
349 return ret_errno(ESRCH
);
352 static bool send_creds_ok(int sock_fd
)
354 char v
= '1'; /* we are the child */
355 struct ucred cred
= {
361 return send_creds(sock_fd
, &cred
, v
, true) == SEND_CREDS_OK
;
364 __returns_twice pid_t
lxcfs_raw_clone(unsigned long flags
, int *pidfd
)
 * These flags are of no interest to us at all, so we don't jump through
 * any hoops of retrieving them and passing them to the kernel.
371 if ((flags
& (CLONE_VM
| CLONE_PARENT_SETTID
| CLONE_CHILD_SETTID
|
372 CLONE_CHILD_CLEARTID
| CLONE_SETTLS
)))
375 #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
376 /* On s390/s390x and cris the order of the first and second arguments
377 * of the system call is reversed.
379 return syscall(__NR_clone
, NULL
, flags
| SIGCHLD
, pidfd
);
380 #elif defined(__sparc__) && defined(__arch64__)
383 * sparc64 always returns the other process id in %o0, and a
384 * boolean flag whether this is the child or the parent in %o1.
385 * Inline assembly is needed to get the flag returned in %o1.
387 register long g1
asm("g1") = __NR_clone
;
388 register long o0
asm("o0") = flags
| SIGCHLD
;
389 register long o1
asm("o1") = 0; /* is parent/child indicator */
390 register long o2
asm("o2") = (unsigned long)pidfd
;
391 long is_error
, retval
, in_child
;
395 #if defined(__arch64__)
396 "t 0x6d\n\t" /* 64-bit trap */
398 "t 0x10\n\t" /* 32-bit trap */
401 * catch errors: On sparc, the carry bit (csr) in the
402 * processor status register (psr) is used instead of a
406 : "=r"(g1
), "=r"(o0
), "=r"(o1
), "=r"(o2
) /* outputs */
407 : "r"(g1
), "r"(o0
), "r"(o1
), "r"(o2
) /* inputs */
408 : "%cc"); /* clobbers */
425 #elif defined(__ia64__)
426 /* On ia64 the stack and stack size are passed as separate arguments. */
427 return syscall(__NR_clone
, flags
| SIGCHLD
, NULL
, prctl_arg(0), pidfd
);
429 return syscall(__NR_clone
, flags
| SIGCHLD
, NULL
, pidfd
);
/*
 * Buffer size needed for a "/proc/<pid>/ns/pid" path including the
 * terminating NUL byte; the pid field is sized via uint64_t.
 */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
 * Clone a task which switches to @target's pid namespace and writes '1'
 * over a unix socket so we can read the task's reaper's pid in our
442 * Note: glibc's fork() does not respect pidns, which can lead to failed
443 * assertions inside glibc (and thus failed forks) if the child's pid in
444 * the pidns and the parent pid outside are identical. Using clone prevents
447 static void write_task_init_pid_exit(int sock
, pid_t target
)
449 __do_close
int fd
= -EBADF
;
450 char path
[LXCFS_PROC_PID_NS_LEN
];
453 snprintf(path
, sizeof(path
), "/proc/%d/ns/pid", (int)target
);
454 fd
= open(path
, O_RDONLY
| O_CLOEXEC
);
456 log_exit("write_task_init_pid_exit open of ns/pid");
459 log_exit("Failed to setns to pid namespace of process %d", target
);
461 pid
= lxcfs_raw_clone(0, NULL
);
466 if (!send_creds_ok(sock
))
472 if (!wait_for_pid(pid
))
478 static pid_t
scm_init_pid(pid_t task
)
482 struct ucred cred
= {
490 if (socketpair(AF_UNIX
, SOCK_DGRAM
, 0, sock
) < 0)
499 write_task_init_pid_exit(sock
[0], task
);
503 if (!recv_creds(sock
[1], &cred
, &v
))
517 pid_t
lookup_initpid_in_store(pid_t pid
)
519 pid_t hashed_pid
= 0;
520 char path
[LXCFS_PROC_PID_NS_LEN
];
523 snprintf(path
, sizeof(path
), "/proc/%d/ns/pid", pid
);
525 return ret_errno(ESRCH
);
529 hashed_pid
= lookup_verify_initpid(st
.st_ino
);
530 if (hashed_pid
< 0) {
531 /* release the mutex as the following call is expensive */
534 hashed_pid
= scm_init_pid(pid
);
539 save_initpid(st
.st_ino
, hashed_pid
);
543 * Prune at the end in case we're pruning the value
544 * we were about to return.
546 prune_initpid_store();
553 * Functions needed to setup cgroups in the __constructor__.
556 static bool umount_if_mounted(void)
558 if (umount2(BASEDIR
, MNT_DETACH
) < 0 && errno
!= EINVAL
) {
559 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR
, strerror(errno
));
/*
 * Integer type of statfs::f_type, derived with __typeof__ so it follows
 * whatever the C library declares; __typeof__ should be safe to use with
 * all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
573 * looking at fs/proc_namespace.c, it appears we can
574 * actually expect the rootfs entry to very specifically contain
575 * " - rootfs rootfs "
576 * IIUC, so long as we've chrooted so that rootfs is not our root,
577 * the rootfs entry should always be skipped in mountinfo contents.
579 static bool is_on_ramfs(void)
581 __do_free
char *line
= NULL
;
582 __do_free
void *fopen_cache
= NULL
;
583 __do_fclose
FILE *f
= NULL
;
586 f
= fopen_cached("/proc/self/mountinfo", "re", &fopen_cache
);
590 while (getline(&line
, &len
, f
) != -1) {
594 for (p
= line
, i
= 0; p
&& i
< 4; i
++)
595 p
= strchr(p
+ 1, ' ');
599 p2
= strchr(p
+ 1, ' ');
603 if (strcmp(p
+ 1, "/") == 0) {
604 /* This is '/'. Is it the ramfs? */
605 p
= strchr(p2
+ 1, '-');
606 if (p
&& strncmp(p
, "- rootfs rootfs ", 16) == 0)
614 static int pivot_enter(void)
616 __do_close
int oldroot
= -EBADF
, newroot
= -EBADF
;
618 oldroot
= open("/", O_DIRECTORY
| O_RDONLY
| O_CLOEXEC
);
620 return log_error_errno(-1, errno
,
621 "Failed to open old root for fchdir");
623 newroot
= open(ROOTDIR
, O_DIRECTORY
| O_RDONLY
| O_CLOEXEC
);
625 return log_error_errno(-1, errno
,
626 "Failed to open new root for fchdir");
628 /* change into new root fs */
629 if (fchdir(newroot
) < 0)
630 return log_error_errno(-1,
631 errno
, "Failed to change directory to new rootfs: %s",
634 /* pivot_root into our new root fs */
635 if (pivot_root(".", ".") < 0)
636 return log_error_errno(-1, errno
,
637 "pivot_root() syscall failed: %s",
 * At this point the old-root is mounted on top of our new-root.
 * To unmount it we must not be chdir'd into it, so escape back
645 if (fchdir(oldroot
) < 0)
646 return log_error_errno(-1, errno
, "Failed to enter old root");
648 if (umount2(".", MNT_DETACH
) < 0)
649 return log_error_errno(-1, errno
, "Failed to detach old root");
651 if (fchdir(newroot
) < 0)
652 return log_error_errno(-1, errno
, "Failed to re-enter new root");
657 static int chroot_enter(void)
659 if (mount(ROOTDIR
, "/", NULL
, MS_REC
| MS_BIND
, NULL
)) {
660 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR
);
664 if (chroot(".") < 0) {
665 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno
));
669 if (chdir("/") < 0) {
670 lxcfs_error("Failed to change directory: %s.\n", strerror(errno
));
677 static int permute_and_enter(void)
681 if (statfs("/", &sb
) < 0) {
682 lxcfs_error("%s\n", "Could not stat / mountpoint.");
/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
 * /proc/self/mountinfo (see is_on_ramfs()). */
689 if (has_fs_type(&sb
, RAMFS_MAGIC
) || is_on_ramfs())
690 return chroot_enter();
692 if (pivot_enter() < 0) {
693 lxcfs_error("%s\n", "Could not perform pivot root.");
700 /* Prepare our new clean root. */
701 static int permute_prepare(void)
703 if (mkdir(ROOTDIR
, 0700) < 0 && errno
!= EEXIST
) {
704 lxcfs_error("%s\n", "Failed to create directory for new root.");
708 if (mount("/", ROOTDIR
, NULL
, MS_BIND
, 0) < 0) {
709 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno
));
713 if (mount(RUNTIME_PATH
, ROOTDIR RUNTIME_PATH
, NULL
, MS_BIND
, 0) < 0) {
714 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno
));
718 if (mount(BASEDIR
, ROOTDIR BASEDIR
, NULL
, MS_REC
| MS_MOVE
, 0) < 0) {
719 printf("Failed to move " BASEDIR
" into new root: %s.\n", strerror(errno
));
/*
 * Switch into the freshly prepared root: permute_prepare() builds it, then
 * permute_and_enter() enters it (chroot() on ramfs, pivot_root() in all
 * other cases). Returns false as soon as either step reports a negative
 * result; the second step is not attempted if the first one fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
740 static bool cgfs_prepare_mounts(void)
742 if (!mkdir_p(BASEDIR
, 0700)) {
743 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
747 if (!umount_if_mounted()) {
748 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
752 if (unshare(CLONE_NEWNS
) < 0) {
753 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno
));
757 cgroup_ops
->mntns_fd
= preserve_ns(getpid(), "mnt");
758 if (cgroup_ops
->mntns_fd
< 0) {
759 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno
));
763 if (mount(NULL
, "/", NULL
, MS_REC
| MS_PRIVATE
, 0) < 0) {
764 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno
));
768 if (mount("tmpfs", BASEDIR
, "tmpfs", 0, "size=100000,mode=700") < 0) {
769 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
776 static bool cgfs_mount_hierarchies(void)
778 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT
, 0755))
781 if (!cgroup_ops
->mount(cgroup_ops
, BASEDIR
))
784 for (struct hierarchy
**h
= cgroup_ops
->hierarchies
; h
&& *h
; h
++) {
785 __do_free
char *path
= must_make_path(BASEDIR
, (*h
)->mountpoint
, NULL
);
786 (*h
)->fd
= open(path
, O_DIRECTORY
| O_CLOEXEC
| O_NOFOLLOW
);
794 static bool cgfs_setup_controllers(void)
796 if (!cgfs_prepare_mounts())
799 if (!cgfs_mount_hierarchies())
800 return log_error_errno(false, errno
, "Failed to set up private lxcfs cgroup mounts");
808 static void sigusr2_toggle_virtualization(int signo
, siginfo_t
*info
, void *extra
)
812 if (reload_successful
) {
813 reload_successful
= 0;
815 /* write() is async signal safe */
816 ret
= write(STDERR_FILENO
,
817 "Switched into non-virtualization mode\n",
818 STRLITERALLEN("Switched into non-virtualization mode\n"));
820 goto please_compiler
;
822 reload_successful
= 1;
824 /* write() is async signal safe */
825 ret
= write(STDERR_FILENO
, "Switched into virtualization mode\n",
826 STRLITERALLEN("Switched into virtualization mode\n"));
828 goto please_compiler
;
 * The write() syscall is a function whose return value needs to be
 * checked. Otherwise the compiler will warn. Another option would be to
 * use syscall(__NR_write, ...) directly but whatever.
840 static void __attribute__((constructor
)) lxcfs_init(void)
842 __do_close
int init_ns
= -EBADF
, root_fd
= -EBADF
,
846 struct hierarchy
*hierarchy
;
848 lxcfs_info("Running constructor %s to reload liblxcfs", __func__
);
850 cgroup_ops
= cgroup_init();
852 lxcfs_info("Failed to initialize cgroup support");
856 /* Preserve initial namespace. */
858 init_ns
= preserve_ns(pid
, "mnt");
860 lxcfs_info("Failed to preserve initial mount namespace");
864 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
865 * to privately mount lxcfs cgroups. */
866 if (!cgfs_setup_controllers()) {
867 log_exit("Failed to setup private cgroup mounts for lxcfs");
871 if (setns(init_ns
, 0) < 0) {
872 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno
));
876 if (!init_cpuview()) {
877 log_exit("Failed to init CPU view");
881 lxcfs_info("mount namespace: %d", cgroup_ops
->mntns_fd
);
882 lxcfs_info("hierarchies:");
884 for (struct hierarchy
**h
= cgroup_ops
->hierarchies
; h
&& *h
; h
++, i
++) {
885 char **controller_list
= (*h
)->controllers
;
886 __do_free
char *controllers
= NULL
;
887 if (controller_list
&& *controller_list
)
888 controllers
= lxc_string_join(",", (const char **)controller_list
, false);
889 lxcfs_info(" %2d: fd: %3d: %s", i
, (*h
)->fd
, controllers
?: "");
892 pidfd
= pidfd_open(pid
, 0);
893 if (pidfd
>= 0 && pidfd_send_signal(pidfd
, 0, NULL
, 0) == 0) {
894 can_use_pidfd
= true;
895 lxcfs_info("Kernel supports pidfds");
898 can_use_swap
= cgroup_ops
->can_use_swap(cgroup_ops
);
900 lxcfs_info("Kernel supports swap accounting");
902 lxcfs_info("Kernel does not support swap accounting");
904 hierarchy
= cgroup_ops
->get_hierarchy(cgroup_ops
, "memory");
905 memory_is_cgroupv2
= hierarchy
&& is_unified_hierarchy(hierarchy
);
907 lxcfs_info("api_extensions:");
908 for (size_t nr
= 0; nr
< nr_api_extensions
; nr
++)
909 lxcfs_info("- %s", api_extensions
[nr
]);
911 root_fd
= open("/", O_PATH
| O_CLOEXEC
);
913 lxcfs_info("%s - Failed to open root directory", strerror(errno
));
914 else if (fchdir(root_fd
) < 0)
915 lxcfs_info("%s - Failed to change to root directory", strerror(errno
));
917 if (install_signal_handler(SIGUSR2
, sigusr2_toggle_virtualization
)) {
918 lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno
));
922 reload_successful
= 1;
926 reload_successful
= 0;
927 lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__
);
930 static void __attribute__((destructor
)) lxcfs_exit(void)
932 lxcfs_info("Running destructor %s", __func__
);
934 clear_initpid_store();
936 cgroup_exit(cgroup_ops
);
939 void *lxcfs_fuse_init(struct fuse_conn_info
*conn
, void *data
)
941 struct fuse_context
*fc
= fuse_get_context();
942 #if HAVE_FUSE_RETURNS_DT_TYPE
943 can_use_sys_cpu
= true;
945 has_versioned_opts
= true;
946 return fc
->private_data
;