1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include "config.h"
4
5 #include <dirent.h>
6 #include <errno.h>
7 #include <fcntl.h>
8 #include <inttypes.h>
9 #include <libgen.h>
10 #include <linux/magic.h>
11 #include <linux/sched.h>
12 #include <pthread.h>
13 #include <sched.h>
14 #include <stdarg.h>
15 #include <stdbool.h>
16 #include <stdint.h>
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <sys/epoll.h>
21 #include <sys/mman.h>
22 #include <sys/mount.h>
23 #include <sys/param.h>
24 #include <sys/socket.h>
25 #include <sys/syscall.h>
26 #include <sys/sysinfo.h>
27 #include <sys/vfs.h>
28 #include <time.h>
29 #include <unistd.h>
30 #include <wait.h>
31
32 #include "bindings.h"
33
34 #include "api_extensions.h"
35 #include "cgroup_fuse.h"
36 #include "cgroups/cgroup.h"
37 #include "cgroups/cgroup_utils.h"
38 #include "memory_utils.h"
39 #include "proc_cpuview.h"
40 #include "syscall_numbers.h"
41 #include "utils.h"
42
43 static bool can_use_pidfd;
44 static bool can_use_swap;
45 static bool can_use_sys_cpu;
46 static bool has_versioned_opts;
47 static bool memory_is_cgroupv2;
48 static __u32 host_personality;
49
50 static volatile sig_atomic_t reload_successful;
51
52 bool liblxcfs_functional(void)
53 {
54 return reload_successful != 0;
55 }
56
57 bool liblxcfs_can_use_swap(void)
58 {
59 return can_use_swap;
60 }
61
62 bool liblxcfs_can_use_sys_cpu(void)
63 {
64 return can_use_sys_cpu;
65 }
66
67 bool liblxcfs_has_versioned_opts(void)
68 {
69 return has_versioned_opts;
70 }
71
72 bool liblxcfs_memory_is_cgroupv2(void)
73 {
74 return memory_is_cgroupv2;
75 }
76
77 __u32 liblxcfs_personality(void)
78 {
79 return host_personality;
80 }
81
82 /* Define pivot_root() if missing from the C library */
83 #ifndef HAVE_PIVOT_ROOT
84 static int pivot_root(const char *new_root, const char *put_old)
85 {
86 return syscall(__NR_pivot_root, new_root, put_old);
87 }
88 #else
89 extern int pivot_root(const char *new_root, const char *put_old);
90 #endif
91
92 /*
93 * A table caching which pid is init for a pid namespace.
94 * When looking up which pid is init for $qpid, we first
95 * 1. Stat /proc/$qpid/ns/pid.
96 * 2. Check whether the ino_t is in our store.
97 * a. if not, fork a child in qpid's ns to send us
98 * ucred.pid = 1, and read the initpid. Cache
99 * initpid and creation time for /proc/initpid
100 * in a new store entry.
101 * b. if so, verify that /proc/initpid still matches
102 * what we have saved. If not, clear the store
103 * entry and go back to a. If so, return the
104 * cached initpid.
105 */
106 struct pidns_init_store {
107 ino_t ino; /* inode number for /proc/$pid/ns/pid */
108 	pid_t initpid; /* the pid of init in that ns */
109 int init_pidfd;
110 int64_t ctime; /* the time at which /proc/$initpid was created */
111 struct pidns_init_store *next;
112 int64_t lastcheck;
113 };
114
115 /* Cf. how the pid hash table is allocated in the kernel. */
116 #define PIDNS_HASH_SIZE 4096
117 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
118
119 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
120 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
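/*
 * Bucket selection is simply the pid namespace inode modulo the table size.
 * As an illustration (the inode value is made up): a namespace whose
 * /proc/$pid/ns/pid inode is 4026532397 lands in bucket
 * HASH(4026532397) = 4026532397 % 4096 = 557, and new entries are pushed
 * onto the front of that bucket's singly linked list.
 */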
121
122 static void mutex_lock(pthread_mutex_t *l)
123 {
124 int ret;
125
126 ret = pthread_mutex_lock(l);
127 if (ret)
128 log_exit("%s - returned %d\n", strerror(ret), ret);
129 }
130
131 struct cgroup_ops *cgroup_ops;
132
133 static void mutex_unlock(pthread_mutex_t *l)
134 {
135 int ret;
136
137 ret = pthread_mutex_unlock(l);
138 if (ret)
139 log_exit("%s - returned %d\n", strerror(ret), ret);
140 }
141
142 static inline void store_lock(void)
143 {
144 mutex_lock(&pidns_store_mutex);
145 }
146
147 static inline void store_unlock(void)
148 {
149 mutex_unlock(&pidns_store_mutex);
150 }
151
152 #define define_interruptible_lock(type, lockname, lockfn) \
153 int lockname##_interruptible(type *l) \
154 { \
155 int ret = ETIMEDOUT; \
156 while (!fuse_interrupted() && (ret == ETIMEDOUT)) { \
157 struct timespec deadline; \
158 clock_gettime(CLOCK_REALTIME, &deadline); \
159 deadline.tv_sec += 1; \
160 ret = lockfn(l, &deadline); \
161 } \
162 return -ret; \
163 }
164
165 define_interruptible_lock(pthread_mutex_t, mutex_lock, pthread_mutex_timedlock)
166 define_interruptible_lock(pthread_rwlock_t, rwlock_rdlock, pthread_rwlock_timedrdlock)
167 define_interruptible_lock(pthread_rwlock_t, rwlock_wrlock, pthread_rwlock_timedwrlock)
168
169 #undef define_interruptible_lock
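/*
 * For reference, mutex_lock_interruptible() above expands to roughly the
 * following (sketch of the macro expansion, not additional code):
 *
 *	int mutex_lock_interruptible(pthread_mutex_t *l)
 *	{
 *		int ret = ETIMEDOUT;
 *		while (!fuse_interrupted() && (ret == ETIMEDOUT)) {
 *			struct timespec deadline;
 *			clock_gettime(CLOCK_REALTIME, &deadline);
 *			deadline.tv_sec += 1;
 *			ret = pthread_mutex_timedlock(l, &deadline);
 *		}
 *		return -ret;
 *	}
 *
 * i.e. the lock is retried in one-second slices so that a FUSE interrupt is
 * noticed within a second instead of blocking indefinitely.
 */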
170
171 /* /proc/ = 6
172 * +
173 * <pid-as-str> = INTTYPE_TO_STRLEN(pid_t)
174 * +
175 * \0 = 1
176 */
177 #define LXCFS_PROC_PID_LEN \
178 	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
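/*
 * Worked example (the pid value is only illustrative): for the largest
 * default 64-bit pid, 4194304, the buffer must hold "/proc/4194304\0",
 * i.e. 6 + 7 + 1 = 14 bytes; sizing the pid part for a uint64_t keeps the
 * bound safe even if pid_max is raised.
 */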
179
180 static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
181 {
182 int ret;
183
184 if (entry->init_pidfd < 0)
185 return ret_errno(ENOSYS);
186
187 ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
188 if (ret < 0) {
189 if (errno == ENOSYS)
190 return ret_errno(ENOSYS);
191
192 return 0;
193 }
194
195 return 1;
196 }
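/*
 * Note: signal 0 makes pidfd_send_signal() perform only its existence and
 * permission checks without delivering anything, so the call above acts as
 * a cheap liveness probe that fails once the cached init has exited.
 */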
197
198 static int initpid_still_valid_stat(struct pidns_init_store *entry)
199 {
200 struct stat st;
201 char path[LXCFS_PROC_PID_LEN];
202
203 snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
204 if (stat(path, &st) || entry->ctime != st.st_ctime)
205 return 0;
206
207 return 1;
208 }
209
210 /* Must be called under store_lock */
211 static bool initpid_still_valid(struct pidns_init_store *entry)
212 {
213 int ret;
214
215 ret = initpid_still_valid_pidfd(entry);
216 if (ret < 0)
217 ret = initpid_still_valid_stat(entry);
218
219 return ret == 1;
220 }
221
222 /* Must be called under store_lock */
223 static void remove_initpid(struct pidns_init_store *entry)
224 {
225 struct pidns_init_store *it;
226 int ino_hash;
227
228 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
229 entry->initpid);
230
231 ino_hash = HASH(entry->ino);
232 if (pidns_hash_table[ino_hash] == entry) {
233 pidns_hash_table[ino_hash] = entry->next;
234 close_prot_errno_disarm(entry->init_pidfd);
235 free_disarm(entry);
236 return;
237 }
238
239 it = pidns_hash_table[ino_hash];
240 while (it) {
241 if (it->next == entry) {
242 it->next = entry->next;
243 close_prot_errno_disarm(entry->init_pidfd);
244 free_disarm(entry);
245 return;
246 }
247 it = it->next;
248 }
249 }
250
251 #define PURGE_SECS 5
252 /* Must be called under store_lock */
253 static void prune_initpid_store(void)
254 {
255 static int64_t last_prune = 0;
256 int64_t now, threshold;
257
258 if (!last_prune) {
259 last_prune = time(NULL);
260 return;
261 }
262
263 now = time(NULL);
264 if (now < (last_prune + PURGE_SECS))
265 return;
266
267 lxcfs_debug("Pruning init pid cache");
268
269 last_prune = now;
270 threshold = now - 2 * PURGE_SECS;
271
272 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
273 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
274 if (entry->lastcheck < threshold) {
275 struct pidns_init_store *cur = entry;
276
277 lxcfs_debug("Removed cache entry for pid %d from init pid cache", cur->initpid);
278
279 if (prev)
280 prev->next = entry->next;
281 else
282 pidns_hash_table[i] = entry->next;
283 entry = entry->next;
284 close_prot_errno_disarm(cur->init_pidfd);
285 free_disarm(cur);
286 } else {
287 prev = entry;
288 entry = entry->next;
289 }
290 }
291 }
292 }
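/*
 * Timing example: with PURGE_SECS == 5, an entry whose lastcheck is more
 * than 2 * PURGE_SECS == 10 seconds old is dropped on the next prune pass,
 * and prune passes themselves run at most once every PURGE_SECS seconds,
 * triggered from lookup_initpid_in_store().
 */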
293
294 static void clear_initpid_store(void)
295 {
296 store_lock();
297 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
298 for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) {
299 struct pidns_init_store *cur = entry;
300
301 lxcfs_debug("Removed cache entry for pid %d from init pid cache", cur->initpid);
302
303 pidns_hash_table[i] = entry->next;
304 entry = entry->next;
305 close_prot_errno_disarm(cur->init_pidfd);
306 free_disarm(cur);
307 }
308 }
309 store_unlock();
310 }
311
312 /* Must be called under store_lock */
313 static void save_initpid(ino_t pidns_inode, pid_t pid)
314 {
315 __do_free struct pidns_init_store *entry = NULL;
316 __do_close int pidfd = -EBADF;
317 const struct lxcfs_opts *opts = fuse_get_context()->private_data;
318 char path[LXCFS_PROC_PID_LEN];
319 struct stat st;
320 int ino_hash;
321
322 if (opts && opts->use_pidfd && can_use_pidfd) {
323 pidfd = pidfd_open(pid, 0);
324 if (pidfd < 0)
325 return;
326 }
327
328 snprintf(path, sizeof(path), "/proc/%d", pid);
329 if (stat(path, &st))
330 return;
331
332 entry = zalloc(sizeof(*entry));
333 if (!entry)
334 return;
335
336 ino_hash = HASH(pidns_inode);
337 *entry = (struct pidns_init_store){
338 .ino = pidns_inode,
339 .initpid = pid,
340 .ctime = st.st_ctime,
341 .next = pidns_hash_table[ino_hash],
342 .lastcheck = time(NULL),
343 .init_pidfd = move_fd(pidfd),
344 };
345 pidns_hash_table[ino_hash] = move_ptr(entry);
346
347 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
348 }
349
350 /*
351  * Given the stat(2) info for a nsfd pid inode, look up the pidns_init_store
352  * entry for that inode number. Verify that the cached init pid is still
353  * valid; if not, remove the entry. Return the cached init pid if valid,
354  * -ESRCH otherwise.
355 * Must be called under store_lock
356 */
357 static pid_t lookup_verify_initpid(ino_t pidns_inode)
358 {
359 struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
360
361 while (entry) {
362 if (entry->ino == pidns_inode) {
363 if (initpid_still_valid(entry)) {
364 entry->lastcheck = time(NULL);
365 return entry->initpid;
366 }
367
368 remove_initpid(entry);
369 return ret_errno(ESRCH);
370 }
371 entry = entry->next;
372 }
373
374 return ret_errno(ESRCH);
375 }
376
377 static bool send_creds_ok(int sock_fd)
378 {
379 char v = '1'; /* we are the child */
380 struct ucred cred = {
381 .uid = 0,
382 .gid = 0,
383 .pid = 1,
384 };
385
386 return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK;
387 }
388
389 __returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
390 {
391 /*
392 	 * These flags are of no interest to us, so we don't jump through any
393 	 * hoops to retrieve them and pass them to the kernel.
394 */
395 errno = EINVAL;
396 if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
397 CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
398 return -EINVAL;
399
400 #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
401 /* On s390/s390x and cris the order of the first and second arguments
402 * of the system call is reversed.
403 */
404 return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
405 #elif defined(__sparc__) && defined(__arch64__)
406 {
407 /*
408 * sparc64 always returns the other process id in %o0, and a
409 * boolean flag whether this is the child or the parent in %o1.
410 * Inline assembly is needed to get the flag returned in %o1.
411 */
412 register long g1 asm("g1") = __NR_clone;
413 register long o0 asm("o0") = flags | SIGCHLD;
414 register long o1 asm("o1") = 0; /* is parent/child indicator */
415 register long o2 asm("o2") = (unsigned long)pidfd;
416 long is_error, retval, in_child;
417 pid_t child_pid;
418
419 asm volatile(
420 #if defined(__arch64__)
421 "t 0x6d\n\t" /* 64-bit trap */
422 #else
423 "t 0x10\n\t" /* 32-bit trap */
424 #endif
425 /*
426 * catch errors: On sparc, the carry bit (csr) in the
427 * processor status register (psr) is used instead of a
428 * full register.
429 */
430 "addx %%g0, 0, %%g1"
431 : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
432 : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */
433 : "%cc"); /* clobbers */
434
435 is_error = g1;
436 retval = o0;
437 in_child = o1;
438
439 if (is_error) {
440 errno = retval;
441 return -1;
442 }
443
444 if (in_child)
445 return 0;
446
447 child_pid = retval;
448 return child_pid;
449 }
450 #elif defined(__ia64__)
451 /* On ia64 the stack and stack size are passed as separate arguments. */
452 return syscall(__NR_clone, flags | SIGCHLD, NULL, 0, pidfd);
453 #else
454 return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
455 #endif
456 }
457
458 #define LXCFS_PROC_PID_NS_LEN \
459 (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
460 STRLITERALLEN("/ns/pid") + 1)
461
462 /*
463  * Clone a task which switches to @task's pid namespace and writes '1'
464  * over a unix socket so we can read the task's reaper's pid in our
465  * namespace.
466 *
467 * Note: glibc's fork() does not respect pidns, which can lead to failed
468 * assertions inside glibc (and thus failed forks) if the child's pid in
469 * the pidns and the parent pid outside are identical. Using clone prevents
470 * this issue.
471 */
472 static void write_task_init_pid_exit(int sock, pid_t target)
473 {
474 __do_close int fd = -EBADF;
475 char path[LXCFS_PROC_PID_NS_LEN];
476 pid_t pid;
477
478 snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
479 fd = open(path, O_RDONLY | O_CLOEXEC);
480 if (fd < 0)
481 log_exit("write_task_init_pid_exit open of ns/pid");
482
483 if (setns(fd, 0))
484 log_exit("Failed to setns to pid namespace of process %d", target);
485
486 pid = lxcfs_raw_clone(0, NULL);
487 if (pid < 0)
488 _exit(EXIT_FAILURE);
489
490 if (pid == 0) {
491 if (!send_creds_ok(sock))
492 _exit(EXIT_FAILURE);
493
494 _exit(EXIT_SUCCESS);
495 }
496
497 if (!wait_for_pid(pid))
498 _exit(EXIT_FAILURE);
499
500 _exit(EXIT_SUCCESS);
501 }
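/*
 * How the init pid is recovered (illustrative): the clone()d child above is
 * a member of the target pid namespace and sends
 *
 *	struct ucred cred = { .pid = 1, .uid = 0, .gid = 0 };
 *
 * as SCM_CREDENTIALS over the socketpair (see send_creds_ok()). The kernel
 * translates cred.pid into the receiver's pid namespace, so recv_creds() in
 * scm_init_pid() observes that namespace's init pid as it appears from
 * lxcfs' own pid namespace.
 */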
502
503 static pid_t scm_init_pid(pid_t task)
504 {
505 char v = '0';
506 pid_t pid_ret = -1;
507 struct ucred cred = {
508 .pid = -1,
509 .uid = -1,
510 .gid = -1,
511 };
512 pid_t pid;
513 int sock[2];
514
515 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
516 return -1;
517
518 pid = fork();
519 if (pid < 0)
520 goto out;
521
522 if (pid == 0) {
523 close(sock[1]);
524 write_task_init_pid_exit(sock[0], task);
525 _exit(EXIT_SUCCESS);
526 }
527
528 if (!recv_creds(sock[1], &cred, &v))
529 goto out;
530
531 pid_ret = cred.pid;
532
533 out:
534 close(sock[0]);
535 close(sock[1]);
536 if (pid > 0)
537 wait_for_pid(pid);
538
539 return pid_ret;
540 }
541
542 pid_t lookup_initpid_in_store(pid_t pid)
543 {
544 pid_t hashed_pid = 0;
545 char path[LXCFS_PROC_PID_NS_LEN];
546 struct stat st;
547
548 snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
549 if (stat(path, &st))
550 return ret_errno(ESRCH);
551
552 store_lock();
553
554 hashed_pid = lookup_verify_initpid(st.st_ino);
555 if (hashed_pid < 0) {
556 /* release the mutex as the following call is expensive */
557 store_unlock();
558
559 hashed_pid = scm_init_pid(pid);
560
561 store_lock();
562
563 if (hashed_pid > 0)
564 save_initpid(st.st_ino, hashed_pid);
565 }
566
567 /*
568 	 * Prune at the end, in case the prune removes the very
569 	 * entry whose value we are about to return.
570 */
571 prune_initpid_store();
572 store_unlock();
573
574 return hashed_pid;
575 }
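/*
 * Typical use from a FUSE request handler (illustrative sketch; the real
 * proc and cgroup handlers add further checks):
 *
 *	struct fuse_context *fc = fuse_get_context();
 *	pid_t initpid = lookup_initpid_in_store(fc->pid);
 *	if (initpid <= 1)
 *		initpid = fc->pid;
 *
 * i.e. when no distinct init pid can be resolved the handler falls back to
 * the requesting pid itself.
 */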
576
577 /*
578 * Functions needed to setup cgroups in the __constructor__.
579 */
580
581 static bool umount_if_mounted(void)
582 {
583 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
584 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
585 return false;
586 }
587 return true;
588 }
589
590 /* __typeof__ should be safe to use with all compilers. */
591 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
592 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
593 {
594 return (fs->f_type == (fs_type_magic)magic_val);
595 }
596
597 /*
598  * Looking at fs/proc_namespace.c, we can expect the rootfs entry in
599  * mountinfo to contain exactly
600  * " - rootfs rootfs "
601  * and, so long as we've chrooted so that rootfs is not our root,
602  * the rootfs entry should always be omitted from mountinfo contents.
603 */
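/*
 * An illustrative mountinfo line the parser below is looking for (the field
 * values are made up; layout as described in proc(5)):
 *
 *	1 1 0:2 / / rw - rootfs rootfs rw
 *
 * Field five is the mount point ("/") and the fields after the "-"
 * separator are the filesystem type and source ("rootfs rootfs").
 */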
604 static bool is_on_ramfs(void)
605 {
606 __do_free char *line = NULL;
607 __do_free void *fopen_cache = NULL;
608 __do_fclose FILE *f = NULL;
609 size_t len = 0;
610
611 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
612 if (!f)
613 return false;
614
615 while (getline(&line, &len, f) != -1) {
616 int i;
617 char *p, *p2;
618
619 for (p = line, i = 0; p && i < 4; i++)
620 p = strchr(p + 1, ' ');
621 if (!p)
622 continue;
623
624 p2 = strchr(p + 1, ' ');
625 if (!p2)
626 continue;
627 *p2 = '\0';
628 if (strcmp(p + 1, "/") == 0) {
629 /* This is '/'. Is it the ramfs? */
630 p = strchr(p2 + 1, '-');
631 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
632 return true;
633 }
634 }
635
636 return false;
637 }
638
639 static int pivot_enter(void)
640 {
641 __do_close int oldroot = -EBADF, newroot = -EBADF;
642
643 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
644 if (oldroot < 0)
645 return log_error_errno(-1, errno,
646 "Failed to open old root for fchdir");
647
648 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
649 if (newroot < 0)
650 return log_error_errno(-1, errno,
651 "Failed to open new root for fchdir");
652
653 /* change into new root fs */
654 if (fchdir(newroot) < 0)
655 return log_error_errno(-1,
656 errno, "Failed to change directory to new rootfs: %s",
657 ROOTDIR);
658
659 /* pivot_root into our new root fs */
660 if (pivot_root(".", ".") < 0)
661 return log_error_errno(-1, errno,
662 "pivot_root() syscall failed: %s",
663 strerror(errno));
664
665 /*
666 * At this point the old-root is mounted on top of our new-root.
667 	 * To unmount it we must not be chdir'd into it, so escape back
668 * to the old-root.
669 */
670 if (fchdir(oldroot) < 0)
671 return log_error_errno(-1, errno, "Failed to enter old root");
672
673 if (umount2(".", MNT_DETACH) < 0)
674 return log_error_errno(-1, errno, "Failed to detach old root");
675
676 if (fchdir(newroot) < 0)
677 return log_error_errno(-1, errno, "Failed to re-enter new root");
678
679 return 0;
680 }
681
682 static int chroot_enter(void)
683 {
684 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
685 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
686 return -1;
687 }
688
689 if (chroot(".") < 0) {
690 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
691 return -1;
692 }
693
694 if (chdir("/") < 0) {
695 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
696 return -1;
697 }
698
699 return 0;
700 }
701
702 static int permute_and_enter(void)
703 {
704 struct statfs sb;
705
706 if (statfs("/", &sb) < 0) {
707 lxcfs_error("%s\n", "Could not stat / mountpoint.");
708 return -1;
709 }
710
711 	/* has_fs_type() is not reliable: when the ramfs is actually a tmpfs it
712 	 * will likely report TMPFS_MAGIC. Hence, when it does not report
713 	 * RAMFS_MAGIC we still check /proc/self/mountinfo. */
714 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
715 return chroot_enter();
716
717 if (pivot_enter() < 0) {
718 lxcfs_error("%s\n", "Could not perform pivot root.");
719 return -1;
720 }
721
722 return 0;
723 }
724
725 /* Prepare our new clean root. */
726 static int permute_prepare(void)
727 {
728 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
729 lxcfs_error("%s\n", "Failed to create directory for new root.");
730 return -1;
731 }
732
733 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
734 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
735 return -1;
736 }
737
738 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
739 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
740 return -1;
741 }
742
743 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
744 		lxcfs_error("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
745 return -1;
746 }
747
748 return 0;
749 }
750
751 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
752 static bool permute_root(void)
753 {
754 /* Prepare new root. */
755 if (permute_prepare() < 0)
756 return false;
757
758 /* Pivot into new root. */
759 if (permute_and_enter() < 0)
760 return false;
761
762 return true;
763 }
764
765 static bool cgfs_prepare_mounts(void)
766 {
767 if (!mkdir_p(BASEDIR, 0700)) {
768 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
769 return false;
770 }
771
772 if (!umount_if_mounted()) {
773 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
774 return false;
775 }
776
777 if (unshare(CLONE_NEWNS) < 0) {
778 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
779 return false;
780 }
781
782 cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
783 if (cgroup_ops->mntns_fd < 0) {
784 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
785 return false;
786 }
787
788 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
789 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
790 return false;
791 }
792
793 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
794 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
795 return false;
796 }
797
798 return true;
799 }
800
801 static bool cgfs_mount_hierarchies(void)
802 {
803 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
804 return false;
805
806 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
807 return false;
808
809 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
810 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
811 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
812 if ((*h)->fd < 0)
813 return false;
814 }
815
816 return true;
817 }
818
819 static bool cgfs_setup_controllers(void)
820 {
821 if (!cgfs_prepare_mounts())
822 return false;
823
824 if (!cgfs_mount_hierarchies())
825 return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");
826
827 if (!permute_root())
828 return false;
829
830 return true;
831 }
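/*
 * Summary of the setup sequence above: cgfs_prepare_mounts() unshares a
 * fresh mount namespace, remounts "/" private and places a small tmpfs over
 * BASEDIR; cgfs_mount_hierarchies() mounts the cgroup hierarchies beneath
 * it and caches a directory fd per hierarchy; permute_root() then pivots
 * (or, on ramfs, chroots) into ROOTDIR so the private cgroup mounts stay
 * reachable from lxcfs' new root while remaining invisible to the rest of
 * the system.
 */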
832
833 static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra)
834 {
835 int ret;
836
837 if (reload_successful) {
838 reload_successful = 0;
839
840 /* write() is async signal safe */
841 ret = write(STDERR_FILENO,
842 "Switched into non-virtualization mode\n",
843 STRLITERALLEN("Switched into non-virtualization mode\n"));
844 if (ret < 0)
845 goto please_compiler;
846 } else {
847 reload_successful = 1;
848
849 /* write() is async signal safe */
850 ret = write(STDERR_FILENO, "Switched into virtualization mode\n",
851 STRLITERALLEN("Switched into virtualization mode\n"));
852 if (ret < 0)
853 goto please_compiler;
854 }
855
856 please_compiler:
857 /*
858 	 * The return value of write() needs to be checked, otherwise the
859 	 * compiler will warn. Another option would be to call
860 	 * syscall(__NR_write, ...) directly, but this is good enough.
861 */
862 return;
863 }
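/*
 * Virtualization can be flipped at runtime by sending SIGUSR2 to the lxcfs
 * process, e.g. (illustrative) `kill -USR2 "$(pidof lxcfs)"`; each signal
 * toggles between "virtualization mode" and "non-virtualization mode" as
 * logged above.
 */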
864
865 static void __attribute__((constructor)) lxcfs_init(void)
866 {
867 __do_close int init_ns = -EBADF, root_fd = -EBADF,
868 pidfd = -EBADF;
869 __do_free char *cgroup = NULL;
870 int i = 0;
871 pid_t pid;
872 struct hierarchy *hierarchy;
873
874 lxcfs_info("Running constructor %s to reload liblxcfs", __func__);
875
876 cgroup_ops = cgroup_init();
877 if (!cgroup_ops) {
878 lxcfs_info("Failed to initialize cgroup support");
879 goto broken_upgrade;
880 }
881
882 /* Preserve initial namespace. */
883 pid = getpid();
884 init_ns = preserve_ns(pid, "mnt");
885 if (init_ns < 0) {
886 lxcfs_info("Failed to preserve initial mount namespace");
887 goto broken_upgrade;
888 }
889
890 	/* This function unshares (CLONE_NEWNS) from our initial mount namespace
891 	 * so we can mount the lxcfs cgroup hierarchies privately. */
892 if (!cgfs_setup_controllers()) {
893 log_exit("Failed to setup private cgroup mounts for lxcfs");
894 goto broken_upgrade;
895 }
896
897 if (setns(init_ns, 0) < 0) {
898 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
899 goto broken_upgrade;
900 }
901
902 if (!init_cpuview()) {
903 log_exit("Failed to init CPU view");
904 goto broken_upgrade;
905 }
906
907 lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
908 lxcfs_info("hierarchies:");
909
910 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
911 char **controller_list = (*h)->controllers;
912 __do_free char *controllers = NULL;
913 if (controller_list && *controller_list)
914 controllers = lxc_string_join(",", (const char **)controller_list, false);
915 lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
916 }
917
918 pidfd = pidfd_open(pid, 0);
919 if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
920 can_use_pidfd = true;
921 lxcfs_info("Kernel supports pidfds");
922 }
923
924 cgroup = get_pid_cgroup(pid, "memory");
925 can_use_swap = cgroup && cgroup_ops->can_use_swap(cgroup_ops, cgroup);
926 if (can_use_swap)
927 lxcfs_info("Kernel supports swap accounting");
928 else
929 lxcfs_info("Kernel does not support swap accounting");
930
931 hierarchy = cgroup_ops->get_hierarchy(cgroup_ops, "memory");
932 memory_is_cgroupv2 = hierarchy && is_unified_hierarchy(hierarchy);
933
934 lxcfs_info("api_extensions:");
935 for (size_t nr = 0; nr < nr_api_extensions; nr++)
936 lxcfs_info("- %s", api_extensions[nr]);
937
938 root_fd = open("/", O_PATH | O_CLOEXEC);
939 if (root_fd < 0)
940 lxcfs_info("%s - Failed to open root directory", strerror(errno));
941 else if (fchdir(root_fd) < 0)
942 lxcfs_info("%s - Failed to change to root directory", strerror(errno));
943
944 if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
945 lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
946 goto broken_upgrade;
947 }
948
949 if (get_task_personality(getpid(), &host_personality) < 0) {
950 lxcfs_info("Failed to retrieve host personality");
951 goto broken_upgrade;
952 }
953
954 reload_successful = 1;
955 return;
956
957 broken_upgrade:
958 reload_successful = 0;
959 lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
960 }
961
962 static void __attribute__((destructor)) lxcfs_exit(void)
963 {
964 lxcfs_info("Running destructor %s", __func__);
965
966 clear_initpid_store();
967 free_cpuview();
968 cgroup_exit(cgroup_ops);
969 }
970
971 void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data)
972 {
973 struct fuse_context *fc = fuse_get_context();
974 #if HAVE_FUSE_RETURNS_DT_TYPE
975 can_use_sys_cpu = true;
976 #endif
977 has_versioned_opts = true;
978 return fc ? fc->private_data : NULL;
979 }