]> git.proxmox.com Git - mirror_lxcfs.git/blob - src/bindings.c
70906c67d6d019b3997ce31dad5d85692f90ddc2
[mirror_lxcfs.git] / src / bindings.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE
5 #endif
6
7 #include "config.h"
8
9 #ifdef HAVE_FUSE3
10 #ifndef FUSE_USE_VERSION
11 #define FUSE_USE_VERSION 30
12 #endif
13 #else
14 #ifndef FUSE_USE_VERSION
15 #define FUSE_USE_VERSION 26
16 #endif
17 #endif
18
19 #define _FILE_OFFSET_BITS 64
20
21 #include <dirent.h>
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <fuse.h>
25 #include <inttypes.h>
26 #include <libgen.h>
27 #include <linux/magic.h>
28 #include <linux/sched.h>
29 #include <pthread.h>
30 #include <sched.h>
31 #include <stdarg.h>
32 #include <stdbool.h>
33 #include <stdint.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <sys/epoll.h>
38 #include <sys/mman.h>
39 #include <sys/mount.h>
40 #include <sys/param.h>
41 #include <sys/socket.h>
42 #include <sys/syscall.h>
43 #include <sys/sysinfo.h>
44 #include <sys/vfs.h>
45 #include <time.h>
46 #include <unistd.h>
47 #include <wait.h>
48
49 #include "api_extensions.h"
50 #include "bindings.h"
51 #include "cgroup_fuse.h"
52 #include "cgroups/cgroup.h"
53 #include "cgroups/cgroup_utils.h"
54 #include "memory_utils.h"
55 #include "proc_cpuview.h"
56 #include "syscall_numbers.h"
57 #include "utils.h"
58
/* Set by the constructor when pidfd_open() and pidfd_send_signal() both work. */
static bool can_use_pidfd;
/* Set by the constructor from cgroup_ops->can_use_swap(). */
static bool can_use_swap;
/* The next two flags are switched on by lxcfs_fuse_init(). */
static bool can_use_sys_cpu;
static bool has_versioned_opts;

/*
 * 1 once the constructor has completed successfully, 0 otherwise. It is also
 * flipped from the SIGUSR2 handler, hence the signal-safe type.
 */
static volatile sig_atomic_t reload_successful;
65
66 bool liblxcfs_functional(void)
67 {
68 return reload_successful != 0;
69 }
70
71 bool liblxcfs_can_use_swap(void)
72 {
73 return can_use_swap;
74 }
75
76 bool liblxcfs_can_use_sys_cpu(void)
77 {
78 return can_use_sys_cpu;
79 }
80
81 bool liblxcfs_has_versioned_opts(void)
82 {
83 return has_versioned_opts;
84 }
85
/*
 * pivot_root(): rely on the C library when the build found it there,
 * otherwise provide a local wrapper around the raw system call.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
95
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *    a. If not, fork a child in qpid's ns to send us
 *       ucred.pid = 1, and read the initpid. Cache
 *       initpid and creation time for /proc/initpid
 *       in a new store entry.
 *    b. If so, verify that /proc/initpid still matches
 *       what we have saved. If not, clear the store
 *       entry and go back to a. If so, return the
 *       cached initpid.
 */
struct pidns_init_store {
	ino_t ino; /* inode number for /proc/$pid/ns/pid */
	pid_t initpid; /* the pid of init in that ns */
	int init_pidfd; /* pidfd for initpid; negative when none was opened */
	int64_t ctime; /* the time at which /proc/$initpid was created */
	struct pidns_init_store *next; /* next entry in the same hash bucket */
	int64_t lastcheck; /* time(NULL) when the entry was saved or last validated */
};
118
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
/* Map a pid namespace inode number to a bucket index of pidns_hash_table. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Each bucket is a singly linked list chained through pidns_init_store.next. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Taken through store_lock() / store_unlock() around accesses to the table. */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
125
/* Lock @l; a failure of pthread_mutex_lock() is handed to log_exit(). */
static void mutex_lock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
134
/* Global cgroup driver handle; assigned from cgroup_init() in the constructor. */
struct cgroup_ops *cgroup_ops;
136
/* Unlock @l; a failure of pthread_mutex_unlock() is handed to log_exit(). */
static void mutex_unlock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
145
146 static inline void store_lock(void)
147 {
148 mutex_lock(&pidns_store_mutex);
149 }
150
151 static inline void store_unlock(void)
152 {
153 mutex_unlock(&pidns_store_mutex);
154 }
155
/*
 * Buffer size needed for "/proc/<pid>":
 *
 * /proc/       = STRLITERALLEN("/proc/") = 6
 * +
 * <pid-as-str> = INTTYPE_TO_STRLEN(uint64_t)
 *                (sized for a 64-bit integer; assumes pid_t is not wider)
 * +
 * \0           = 1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
164
165 static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
166 {
167 int ret;
168
169 if (entry->init_pidfd < 0)
170 return ret_errno(ENOSYS);
171
172 ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
173 if (ret < 0) {
174 if (errno == ENOSYS)
175 return ret_errno(ENOSYS);
176
177 return 0;
178 }
179
180 return 1;
181 }
182
183 static int initpid_still_valid_stat(struct pidns_init_store *entry)
184 {
185 struct stat st;
186 char path[LXCFS_PROC_PID_LEN];
187
188 snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
189 if (stat(path, &st) || entry->ctime != st.st_ctime)
190 return 0;
191
192 return 1;
193 }
194
/*
 * Decide whether a cached entry still refers to a live init process.
 * The pidfd based check is tried first; when it reports a negative value
 * the stat() based check is used instead.
 * Must be called under store_lock.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int alive = initpid_still_valid_pidfd(entry);

	if (alive < 0)
		alive = initpid_still_valid_stat(entry);

	return alive == 1;
}
206
207 /* Must be called under store_lock */
208 static void remove_initpid(struct pidns_init_store *entry)
209 {
210 struct pidns_init_store *it;
211 int ino_hash;
212
213 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
214 entry->initpid);
215
216 ino_hash = HASH(entry->ino);
217 if (pidns_hash_table[ino_hash] == entry) {
218 pidns_hash_table[ino_hash] = entry->next;
219 close_prot_errno_disarm(entry->init_pidfd);
220 free_disarm(entry);
221 return;
222 }
223
224 it = pidns_hash_table[ino_hash];
225 while (it) {
226 if (it->next == entry) {
227 it->next = entry->next;
228 close_prot_errno_disarm(entry->init_pidfd);
229 free_disarm(entry);
230 return;
231 }
232 it = it->next;
233 }
234 }
235
236 #define PURGE_SECS 5
237 /* Must be called under store_lock */
238 static void prune_initpid_store(void)
239 {
240 static int64_t last_prune = 0;
241 int64_t now, threshold;
242
243 if (!last_prune) {
244 last_prune = time(NULL);
245 return;
246 }
247
248 now = time(NULL);
249 if (now < (last_prune + PURGE_SECS))
250 return;
251
252 lxcfs_debug("Pruning init pid cache");
253
254 last_prune = now;
255 threshold = now - 2 * PURGE_SECS;
256
257 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
258 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
259 if (entry->lastcheck < threshold) {
260 struct pidns_init_store *cur = entry;
261
262 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
263
264 if (prev)
265 prev->next = entry->next;
266 else
267 pidns_hash_table[i] = entry->next;
268 entry = entry->next;
269 close_prot_errno_disarm(cur->init_pidfd);
270 free_disarm(cur);
271 } else {
272 prev = entry;
273 entry = entry->next;
274 }
275 }
276 }
277 }
278
279 static void clear_initpid_store(void)
280 {
281 store_lock();
282 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
283 for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) {
284 struct pidns_init_store *cur = entry;
285
286 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
287
288 pidns_hash_table[i] = entry->next;
289 entry = entry->next;
290 close_prot_errno_disarm(cur->init_pidfd);
291 free_disarm(cur);
292 }
293 }
294 store_unlock();
295 }
296
297 /* Must be called under store_lock */
298 static void save_initpid(ino_t pidns_inode, pid_t pid)
299 {
300 __do_free struct pidns_init_store *entry = NULL;
301 __do_close int pidfd = -EBADF;
302 const struct lxcfs_opts *opts = fuse_get_context()->private_data;
303 char path[LXCFS_PROC_PID_LEN];
304 struct stat st;
305 int ino_hash;
306
307 if (opts && opts->use_pidfd && can_use_pidfd) {
308 pidfd = pidfd_open(pid, 0);
309 if (pidfd < 0)
310 return;
311 }
312
313 snprintf(path, sizeof(path), "/proc/%d", pid);
314 if (stat(path, &st))
315 return;
316
317 entry = zalloc(sizeof(*entry));
318 if (!entry)
319 return;
320
321 ino_hash = HASH(pidns_inode);
322 *entry = (struct pidns_init_store){
323 .ino = pidns_inode,
324 .initpid = pid,
325 .ctime = st.st_ctime,
326 .next = pidns_hash_table[ino_hash],
327 .lastcheck = time(NULL),
328 .init_pidfd = move_fd(pidfd),
329 };
330 pidns_hash_table[ino_hash] = move_ptr(entry);
331
332 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
333 }
334
335 /*
336 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
337 * entry for the inode number and creation time. Verify that the init pid
338 * is still valid. If not, remove it. Return the entry if valid, NULL
339 * otherwise.
340 * Must be called under store_lock
341 */
342 static pid_t lookup_verify_initpid(ino_t pidns_inode)
343 {
344 struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
345
346 while (entry) {
347 if (entry->ino == pidns_inode) {
348 if (initpid_still_valid(entry)) {
349 entry->lastcheck = time(NULL);
350 return entry->initpid;
351 }
352
353 remove_initpid(entry);
354 return ret_errno(ESRCH);
355 }
356 entry = entry->next;
357 }
358
359 return ret_errno(ESRCH);
360 }
361
362 static bool send_creds_ok(int sock_fd)
363 {
364 char v = '1'; /* we are the child */
365 struct ucred cred = {
366 .uid = 0,
367 .gid = 0,
368 .pid = 1,
369 };
370
371 return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK;
372 }
373
/*
 * Thin wrapper around the raw clone system call.
 *
 * @flags: clone flags; SIGCHLD is always OR-ed in. CLONE_VM,
 *         CLONE_PARENT_SETTID, CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID and
 *         CLONE_SETTLS are rejected.
 * @pidfd: handed to the kernel unchanged as the last syscall argument; callers
 *         in this file pass NULL.
 *
 * Returns -EINVAL (with errno set to EINVAL) for rejected flags. Otherwise
 * the result of the system call is returned; on the sparc64 path that is 0
 * in the child, the child's pid in the parent and -1 with errno set on error.
 * Note that errno is set to EINVAL up front, before the flags are inspected.
 */
__returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
{
	/*
	 * These flags don't interest at all so we don't jump through any hoops
	 * of retrieving them and passing them to the kernel.
	 */
	errno = EINVAL;
	if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
		      CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
		return -EINVAL;

#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
	/* On s390/s390x and cris the order of the first and second arguments
	 * of the system call is reversed.
	 */
	return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
#elif defined(__sparc__) && defined(__arch64__)
	{
		/*
		 * sparc64 always returns the other process id in %o0, and a
		 * boolean flag whether this is the child or the parent in %o1.
		 * Inline assembly is needed to get the flag returned in %o1.
		 */
		register long g1 asm("g1") = __NR_clone;
		register long o0 asm("o0") = flags | SIGCHLD;
		register long o1 asm("o1") = 0; /* is parent/child indicator */
		register long o2 asm("o2") = (unsigned long)pidfd;
		long is_error, retval, in_child;
		pid_t child_pid;

		asm volatile(
#if defined(__arch64__)
		    "t 0x6d\n\t" /* 64-bit trap */
#else
		    "t 0x10\n\t" /* 32-bit trap */
#endif
		    /*
		     * catch errors: On sparc, the carry bit (csr) in the
		     * processor status register (psr) is used instead of a
		     * full register.
		     */
		    "addx %%g0, 0, %%g1"
		    : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
		    : "r"(g1), "r"(o0), "r"(o1), "r"(o2)     /* inputs */
		    : "%cc");				     /* clobbers */

		is_error = g1;
		retval = o0;
		in_child = o1;

		/* On failure %o0 carries the error code rather than a pid. */
		if (is_error) {
			errno = retval;
			return -1;
		}

		if (in_child)
			return 0;

		child_pid = retval;
		return child_pid;
	}
#elif defined(__ia64__)
	/* On ia64 the stack and stack size are passed as separate arguments. */
	return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd);
#else
	return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
#endif
}
442
/*
 * Buffer size for "/proc/<pid>/ns/pid": both literal parts, room for the
 * decimal form of a 64-bit integer, and the trailing \0.
 */
#define LXCFS_PROC_PID_NS_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
446
447 /*
448 * clone a task which switches to @task's namespace and writes '1'.
449 * over a unix sock so we can read the task's reaper's pid in our
450 * namespace
451 *
452 * Note: glibc's fork() does not respect pidns, which can lead to failed
453 * assertions inside glibc (and thus failed forks) if the child's pid in
454 * the pidns and the parent pid outside are identical. Using clone prevents
455 * this issue.
456 */
457 static void write_task_init_pid_exit(int sock, pid_t target)
458 {
459 __do_close int fd = -EBADF;
460 char path[LXCFS_PROC_PID_NS_LEN];
461 pid_t pid;
462
463 snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
464 fd = open(path, O_RDONLY | O_CLOEXEC);
465 if (fd < 0)
466 log_exit("write_task_init_pid_exit open of ns/pid");
467
468 if (setns(fd, 0))
469 log_exit("Failed to setns to pid namespace of process %d", target);
470
471 pid = lxcfs_raw_clone(0, NULL);
472 if (pid < 0)
473 _exit(EXIT_FAILURE);
474
475 if (pid == 0) {
476 if (!send_creds_ok(sock))
477 _exit(EXIT_FAILURE);
478
479 _exit(EXIT_SUCCESS);
480 }
481
482 if (!wait_for_pid(pid))
483 _exit(EXIT_FAILURE);
484
485 _exit(EXIT_SUCCESS);
486 }
487
/*
 * Determine the init pid of @task's pid namespace as seen from our own
 * namespace. A forked helper runs write_task_init_pid_exit() on one end of a
 * datagram socket pair while we receive the credentials on the other end.
 * Returns the received pid, or -1 if the socket pair, the fork or the
 * receive failed.
 */
static pid_t scm_init_pid(pid_t task)
{
	struct ucred received = {
		.pid = -1,
		.uid = -1,
		.gid = -1,
	};
	pid_t init_pid = -1;
	char tag = '0';
	pid_t helper;
	int pair[2];

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) < 0)
		return -1;

	helper = fork();
	if (helper == 0) {
		close(pair[1]);
		write_task_init_pid_exit(pair[0], task);
		_exit(EXIT_SUCCESS);
	}

	if (helper > 0 && recv_creds(pair[1], &received, &tag))
		init_pid = received.pid;

	close(pair[0]);
	close(pair[1]);

	/* Reap the helper if one was actually created. */
	if (helper > 0)
		wait_for_pid(helper);

	return init_pid;
}
526
527 pid_t lookup_initpid_in_store(pid_t pid)
528 {
529 pid_t hashed_pid = 0;
530 char path[LXCFS_PROC_PID_NS_LEN];
531 struct stat st;
532
533 snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
534 if (stat(path, &st))
535 return ret_errno(ESRCH);
536
537 store_lock();
538
539 hashed_pid = lookup_verify_initpid(st.st_ino);
540 if (hashed_pid < 0) {
541 /* release the mutex as the following call is expensive */
542 store_unlock();
543
544 hashed_pid = scm_init_pid(pid);
545
546 store_lock();
547
548 if (hashed_pid > 0)
549 save_initpid(st.st_ino, hashed_pid);
550 }
551
552 /*
553 * Prune at the end in case we're pruning the value
554 * we were about to return.
555 */
556 prune_initpid_store();
557 store_unlock();
558
559 return hashed_pid;
560 }
561
562 /*
563 * Functions needed to setup cgroups in the __constructor__.
564 */
565
566 static bool umount_if_mounted(void)
567 {
568 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
569 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
570 return false;
571 }
572 return true;
573 }
574
/*
 * The type of statfs.f_type is taken from the struct itself.
 * __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem described by @fs has magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
581
582 /*
583 * looking at fs/proc_namespace.c, it appears we can
584 * actually expect the rootfs entry to very specifically contain
585 * " - rootfs rootfs "
586 * IIUC, so long as we've chrooted so that rootfs is not our root,
587 * the rootfs entry should always be skipped in mountinfo contents.
588 */
589 static bool is_on_ramfs(void)
590 {
591 __do_free char *line = NULL;
592 __do_free void *fopen_cache = NULL;
593 __do_fclose FILE *f = NULL;
594 size_t len = 0;
595
596 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
597 if (!f)
598 return false;
599
600 while (getline(&line, &len, f) != -1) {
601 int i;
602 char *p, *p2;
603
604 for (p = line, i = 0; p && i < 4; i++)
605 p = strchr(p + 1, ' ');
606 if (!p)
607 continue;
608
609 p2 = strchr(p + 1, ' ');
610 if (!p2)
611 continue;
612 *p2 = '\0';
613 if (strcmp(p + 1, "/") == 0) {
614 /* This is '/'. Is it the ramfs? */
615 p = strchr(p2 + 1, '-');
616 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
617 return true;
618 }
619 }
620
621 return false;
622 }
623
624 static int pivot_enter()
625 {
626 __do_close int oldroot = -EBADF, newroot = -EBADF;
627
628 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
629 if (oldroot < 0)
630 return log_error_errno(-1, errno,
631 "Failed to open old root for fchdir");
632
633 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
634 if (newroot < 0)
635 return log_error_errno(-1, errno,
636 "Failed to open new root for fchdir");
637
638 /* change into new root fs */
639 if (fchdir(newroot) < 0)
640 return log_error_errno(-1,
641 errno, "Failed to change directory to new rootfs: %s",
642 ROOTDIR);
643
644 /* pivot_root into our new root fs */
645 if (pivot_root(".", ".") < 0)
646 return log_error_errno(-1, errno,
647 "pivot_root() syscall failed: %s",
648 strerror(errno));
649
650 /*
651 * At this point the old-root is mounted on top of our new-root.
652 * To unmounted it we must not be chdir'd into it, so escape back
653 * to the old-root.
654 */
655 if (fchdir(oldroot) < 0)
656 return log_error_errno(-1, errno, "Failed to enter old root");
657
658 if (umount2(".", MNT_DETACH) < 0)
659 return log_error_errno(-1, errno, "Failed to detach old root");
660
661 if (fchdir(newroot) < 0)
662 return log_error_errno(-1, errno, "Failed to re-enter new root");
663
664 return 0;
665 }
666
667 static int chroot_enter()
668 {
669 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
670 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
671 return -1;
672 }
673
674 if (chroot(".") < 0) {
675 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
676 return -1;
677 }
678
679 if (chdir("/") < 0) {
680 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
681 return -1;
682 }
683
684 return 0;
685 }
686
/*
 * Enter the prepared root: chroot_enter() when "/" is a ramfs,
 * pivot_enter() otherwise. Returns 0 on success, -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs root_fs;
	bool on_ramfs;

	if (statfs("/", &root_fs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still let
	 * is_on_ramfs() inspect /proc/self/mountinfo. */
	on_ramfs = has_fs_type(&root_fs, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
709
710 /* Prepare our new clean root. */
711 static int permute_prepare(void)
712 {
713 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
714 lxcfs_error("%s\n", "Failed to create directory for new root.");
715 return -1;
716 }
717
718 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
719 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
720 return -1;
721 }
722
723 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
724 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
725 return -1;
726 }
727
728 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
729 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
730 return -1;
731 }
732
733 return 0;
734 }
735
/*
 * Prepare the new root and then enter it. Entering uses chroot() on ramfs
 * and pivot_root() in all other cases. Returns true when both steps worked.
 */
static bool permute_root(void)
{
	/* The second step only runs when the first one succeeded. */
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
749
750 static bool cgfs_prepare_mounts(void)
751 {
752 if (!mkdir_p(BASEDIR, 0700)) {
753 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
754 return false;
755 }
756
757 if (!umount_if_mounted()) {
758 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
759 return false;
760 }
761
762 if (unshare(CLONE_NEWNS) < 0) {
763 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
764 return false;
765 }
766
767 cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
768 if (cgroup_ops->mntns_fd < 0) {
769 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
770 return false;
771 }
772
773 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
774 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
775 return false;
776 }
777
778 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
779 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
780 return false;
781 }
782
783 return true;
784 }
785
786 static bool cgfs_mount_hierarchies(void)
787 {
788 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
789 return false;
790
791 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
792 return false;
793
794 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
795 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
796 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
797 if ((*h)->fd < 0)
798 return false;
799 }
800
801 return true;
802 }
803
/*
 * Run the three setup stages in order: private mounts, hierarchy mounts and
 * the switch into the new root. Stops at the first failing stage.
 */
static bool cgfs_setup_controllers(void)
{
	bool prepared = cgfs_prepare_mounts();

	if (!prepared)
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root() ? true : false;
}
817
818 static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra)
819 {
820 int ret;
821
822 if (reload_successful) {
823 reload_successful = 0;
824
825 /* write() is async signal safe */
826 ret = write(STDERR_FILENO,
827 "Switched into non-virtualization mode\n",
828 STRLITERALLEN("Switched into non-virtualization mode\n"));
829 if (ret < 0)
830 goto please_compiler;
831 } else {
832 reload_successful = 1;
833
834 /* write() is async signal safe */
835 ret = write(STDERR_FILENO, "Switched into virtualization mode\n",
836 STRLITERALLEN("Switched into virtualization mode\n"));
837 if (ret < 0)
838 goto please_compiler;
839 }
840
841 please_compiler:
842 /*
843 * The write() syscall is a function whose return value needs to be
844 * checked. Otherwise the compiler will warn.Another one could be to
845 * use syscall(__NR_write, ...) directly but whatever.
846 */
847 return;
848 }
849
/*
 * Library constructor.
 *
 * Initializes cgroup support, sets up the private cgroup mounts in a separate
 * mount namespace, switches back to the initial mount namespace, initializes
 * the CPU view, probes pidfd and swap accounting support, changes to the root
 * directory and installs the SIGUSR2 handler. reload_successful is set to 1
 * only when the end of the function is reached; the broken_upgrade path
 * leaves it at 0.
 */
static void __attribute__((constructor)) lxcfs_init(void)
{
	__do_close int init_ns = -EBADF, root_fd = -EBADF,
				pidfd = -EBADF;
	int i = 0;
	pid_t pid;

	lxcfs_info("Running constructor %s to reload liblxcfs", __func__);

	cgroup_ops = cgroup_init();
	if (!cgroup_ops) {
		lxcfs_info("Failed to initialize cgroup support");
		goto broken_upgrade;
	}

	/* Preserve initial namespace. */
	pid = getpid();
	init_ns = preserve_ns(pid, "mnt");
	if (init_ns < 0) {
		lxcfs_info("Failed to preserve initial mount namespace");
		goto broken_upgrade;
	}

	/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
	 * to privately mount lxcfs cgroups. */
	/* NOTE(review): if log_exit() terminates the process, the gotos that
	 * follow it here and below are never reached - confirm. */
	if (!cgfs_setup_controllers()) {
		log_exit("Failed to setup private cgroup mounts for lxcfs");
		goto broken_upgrade;
	}

	if (setns(init_ns, 0) < 0) {
		log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
		goto broken_upgrade;
	}

	if (!init_cpuview()) {
		log_exit("Failed to init CPU view");
		goto broken_upgrade;
	}

	lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
	lxcfs_info("hierarchies:");

	/* Log one line per hierarchy: index, fd and its comma-joined controllers. */
	for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
		char **controller_list = (*h)->controllers;
		__do_free char *controllers = NULL;
		if (controller_list && *controller_list)
			controllers = lxc_string_join(",", (const char **)controller_list, false);
		lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
	}

	/* pidfds count as usable only if both opening one and signalling through it work. */
	pidfd = pidfd_open(pid, 0);
	if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
		can_use_pidfd = true;
		lxcfs_info("Kernel supports pidfds");
	}

	can_use_swap = cgroup_ops->can_use_swap(cgroup_ops);
	if (can_use_swap)
		lxcfs_info("Kernel supports swap accounting");
	else
		lxcfs_info("Kernel does not support swap accounting");

	lxcfs_info("api_extensions:");
	for (i = 0; i < nr_api_extensions; i++)
		lxcfs_info("- %s", api_extensions[i]);

	/* Failing to change to "/" is only logged; it does not abort the constructor. */
	root_fd = open("/", O_PATH | O_CLOEXEC);
	if (root_fd < 0)
		lxcfs_info("%s - Failed to open root directory", strerror(errno));
	else if (fchdir(root_fd) < 0)
		lxcfs_info("%s - Failed to change to root directory", strerror(errno));

	if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
		lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
		goto broken_upgrade;
	}

	reload_successful = 1;
	return;

broken_upgrade:
	reload_successful = 0;
	lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
}
935
936 static void __attribute__((destructor)) lxcfs_exit(void)
937 {
938 lxcfs_info("Running destructor %s", __func__);
939
940 clear_initpid_store();
941 free_cpuview();
942 cgroup_exit(cgroup_ops);
943 }
944
945 void *lxcfs_fuse_init(struct fuse_conn_info *conn, struct fuse_config *cfg)
946 {
947 struct fuse_context *fc = fuse_get_context();
948 can_use_sys_cpu = true;
949 has_versioned_opts = true;
950 return fc->private_data;
951 }