]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/bindings.c
make meminfo and swaps cgroupv2 aware
[mirror_lxcfs.git] / src / bindings.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
237e200e 2
f834b6bf
SP
3#include "config.h"
4
237e200e 5#include <dirent.h>
29a73c2f 6#include <errno.h>
237e200e 7#include <fcntl.h>
0ecddf02 8#include <inttypes.h>
237e200e 9#include <libgen.h>
dee86006
CB
10#include <linux/magic.h>
11#include <linux/sched.h>
237e200e 12#include <pthread.h>
29a73c2f 13#include <sched.h>
db1b32f6 14#include <stdarg.h>
29a73c2f 15#include <stdbool.h>
0ecddf02 16#include <stdint.h>
29a73c2f
CB
17#include <stdio.h>
18#include <stdlib.h>
19#include <string.h>
29a73c2f
CB
20#include <sys/epoll.h>
21#include <sys/mman.h>
22#include <sys/mount.h>
237e200e
SH
23#include <sys/param.h>
24#include <sys/socket.h>
29a73c2f 25#include <sys/syscall.h>
0ecddf02 26#include <sys/sysinfo.h>
d89504c4 27#include <sys/vfs.h>
dee86006
CB
28#include <time.h>
29#include <unistd.h>
30#include <wait.h>
237e200e 31
237e200e 32#include "bindings.h"
e01afbb7
CB
33
34#include "api_extensions.h"
580fe4df 35#include "cgroup_fuse.h"
5fbea8a6
CB
36#include "cgroups/cgroup.h"
37#include "cgroups/cgroup_utils.h"
c9236032 38#include "memory_utils.h"
1f5596dd 39#include "proc_cpuview.h"
8364a99c 40#include "syscall_numbers.h"
1d81c6a6 41#include "utils.h"
237e200e 42
/*
 * Capability flags exposed through the liblxcfs_*() accessors below.
 * can_use_pidfd, can_use_swap and memory_is_cgroupv2 are assigned in
 * lxcfs_init(); can_use_sys_cpu and has_versioned_opts are assigned in
 * lxcfs_fuse_init().
 */
static bool can_use_pidfd, can_use_swap;
static bool can_use_sys_cpu, has_versioned_opts;
static bool memory_is_cgroupv2;

/*
 * Non-zero once lxcfs_init() has completed; flipped by the SIGUSR2 handler,
 * hence the signal-safe type.
 */
static volatile sig_atomic_t reload_successful;
cbfc55fd
CB
50
51bool liblxcfs_functional(void)
52{
b9b6bdc9 53 return reload_successful != 0;
cbfc55fd 54}
2aa59b2e 55
c6805016
CB
56bool liblxcfs_can_use_swap(void)
57{
58 return can_use_swap;
59}
60
285aea40
CB
61bool liblxcfs_can_use_sys_cpu(void)
62{
63 return can_use_sys_cpu;
64}
65
66bool liblxcfs_has_versioned_opts(void)
67{
68 return has_versioned_opts;
69}
70
50f7faee
WB
71bool liblxcfs_memory_is_cgroupv2(void)
72{
73 return memory_is_cgroupv2;
74}
75
29a73c2f
CB
/*
 * pivot_root(): use the C library's declaration when the build system found
 * one (HAVE_PIVOT_ROOT), otherwise issue the raw system call ourselves.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
85
237e200e
SH
/*
 * Cache mapping a pid namespace to the pid of its init process.
 *
 * Looking up the init pid for some $qpid works as follows:
 *  1. stat() /proc/$qpid/ns/pid to get the namespace inode.
 *  2. Search the store for that inode.
 *     a. Not found: a child is spawned into $qpid's pid namespace and sends
 *        us ucred.pid = 1, from which we read the init pid. The init pid and
 *        the ctime of /proc/$initpid are cached in a new entry.
 *     b. Found: check that /proc/$initpid still matches what was saved. If
 *        it does, return the cached init pid; if not, drop the entry and
 *        proceed as in a.
 */
struct pidns_init_store {
	ino_t ino;      /* inode number of /proc/$pid/ns/pid */
	pid_t initpid;  /* pid of init in that namespace */
	int init_pidfd; /* pidfd for initpid; negative when none is held */
	int64_t ctime;  /* st_ctime of /proc/$initpid when the entry was saved */
	struct pidns_init_store *next; /* next entry in the same hash bucket */
	int64_t lastcheck; /* time(NULL) of the last save or successful lookup */
};
108
/*
 * Bucket count and hash function of the init pid cache (the original author's
 * remark: look at how they are allocated in the kernel).
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Buckets are singly linked lists; all access is guarded by pidns_store_mutex. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da 115
/* Acquire @l; a non-zero pthread error code is handed to log_exit(). */
static void mutex_lock(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
124
/* Global cgroup driver handle; assigned from cgroup_init() in lxcfs_init(). */
struct cgroup_ops *cgroup_ops;
29a73c2f 126
/* Release @l; a non-zero pthread error code is handed to log_exit(). */
static void mutex_unlock(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
135
4e1e4115 136static inline void store_lock(void)
237e200e 137{
4e1e4115 138 mutex_lock(&pidns_store_mutex);
237e200e
SH
139}
140
4e1e4115 141static inline void store_unlock(void)
237e200e 142{
4e1e4115 143 mutex_unlock(&pidns_store_mutex);
237e200e
SH
144}
145
2aa59b2e
CB
/*
 * Buffer size needed for a "/proc/<pid>" path:
 *
 *   "/proc/"      STRLITERALLEN("/proc/")
 * + <pid-as-str>  INTTYPE_TO_STRLEN(uint64_t)
 * + '\0'          1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
154
bc189096 155static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
237e200e 156{
bc189096 157 int ret;
237e200e 158
bc189096
CB
159 if (entry->init_pidfd < 0)
160 return ret_errno(ENOSYS);
7dd6560a 161
bc189096
CB
162 ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
163 if (ret < 0) {
164 if (errno == ENOSYS)
165 return ret_errno(ENOSYS);
7dd6560a 166
bc189096 167 return 0;
2aa59b2e
CB
168 }
169
bc189096
CB
170 return 1;
171}
172
173static int initpid_still_valid_stat(struct pidns_init_store *entry)
174{
175 struct stat st;
176 char path[LXCFS_PROC_PID_LEN];
177
178 snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
179 if (stat(path, &st) || entry->ctime != st.st_ctime)
180 return 0;
181
182 return 1;
183}
184
/*
 * Decide whether @entry still describes a live init process.
 * Must be called under store_lock.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int verdict = initpid_still_valid_pidfd(entry);

	/* A negative verdict means the pidfd check was unusable: use stat(). */
	if (verdict < 0)
		verdict = initpid_still_valid_stat(entry);

	return verdict == 1;
}
196
197/* Must be called under store_lock */
2aa59b2e 198static void remove_initpid(struct pidns_init_store *entry)
237e200e 199{
2aa59b2e
CB
200 struct pidns_init_store *it;
201 int ino_hash;
237e200e 202
2aa59b2e
CB
203 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
204 entry->initpid);
7dd6560a 205
2aa59b2e
CB
206 ino_hash = HASH(entry->ino);
207 if (pidns_hash_table[ino_hash] == entry) {
208 pidns_hash_table[ino_hash] = entry->next;
209 close_prot_errno_disarm(entry->init_pidfd);
210 free_disarm(entry);
237e200e
SH
211 return;
212 }
213
2aa59b2e
CB
214 it = pidns_hash_table[ino_hash];
215 while (it) {
216 if (it->next == entry) {
217 it->next = entry->next;
218 close_prot_errno_disarm(entry->init_pidfd);
219 free_disarm(entry);
237e200e
SH
220 return;
221 }
2aa59b2e 222 it = it->next;
237e200e
SH
223 }
224}
225
226#define PURGE_SECS 5
227/* Must be called under store_lock */
228static void prune_initpid_store(void)
229{
1ba088ae
CB
230 static int64_t last_prune = 0;
231 int64_t now, threshold;
237e200e
SH
232
233 if (!last_prune) {
234 last_prune = time(NULL);
235 return;
236 }
2aa59b2e 237
237e200e 238 now = time(NULL);
b18d6121 239 if (now < (last_prune + PURGE_SECS))
237e200e 240 return;
7dd6560a 241
2aa59b2e 242 lxcfs_debug("Pruning init pid cache");
7dd6560a 243
237e200e
SH
244 last_prune = now;
245 threshold = now - 2 * PURGE_SECS;
246
2aa59b2e
CB
247 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
248 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
249 if (entry->lastcheck < threshold) {
250 struct pidns_init_store *cur = entry;
7dd6560a 251
2aa59b2e 252 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
7dd6560a 253
237e200e 254 if (prev)
2aa59b2e 255 prev->next = entry->next;
237e200e 256 else
2aa59b2e
CB
257 pidns_hash_table[i] = entry->next;
258 entry = entry->next;
259 close_prot_errno_disarm(cur->init_pidfd);
260 free_disarm(cur);
237e200e 261 } else {
2aa59b2e
CB
262 prev = entry;
263 entry = entry->next;
237e200e
SH
264 }
265 }
266 }
267}
268
c8f77ce4
CB
269static void clear_initpid_store(void)
270{
271 store_lock();
272 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
273 for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) {
274 struct pidns_init_store *cur = entry;
275
276 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
277
278 pidns_hash_table[i] = entry->next;
279 entry = entry->next;
280 close_prot_errno_disarm(cur->init_pidfd);
281 free_disarm(cur);
282 }
283 }
284 store_unlock();
285}
286
237e200e 287/* Must be called under store_lock */
fcdedd16 288static void save_initpid(ino_t pidns_inode, pid_t pid)
237e200e 289{
1e5d03fe 290 __do_free struct pidns_init_store *entry = NULL;
05b7a16d 291 __do_close int pidfd = -EBADF;
536620fd 292 const struct lxcfs_opts *opts = fuse_get_context()->private_data;
2aa59b2e 293 char path[LXCFS_PROC_PID_LEN];
2aa59b2e
CB
294 struct stat st;
295 int ino_hash;
296
9973cc06 297 if (opts && opts->use_pidfd && can_use_pidfd) {
2aa59b2e
CB
298 pidfd = pidfd_open(pid, 0);
299 if (pidfd < 0)
300 return;
301 }
237e200e 302
2aa59b2e
CB
303 snprintf(path, sizeof(path), "/proc/%d", pid);
304 if (stat(path, &st))
305 return;
7dd6560a 306
5ec289bf 307 entry = zalloc(sizeof(*entry));
0eb3756b 308 if (!entry)
237e200e 309 return;
2aa59b2e 310
97017213 311 ino_hash = HASH(pidns_inode);
1e5d03fe 312 *entry = (struct pidns_init_store){
fcdedd16 313 .ino = pidns_inode,
1e5d03fe
CB
314 .initpid = pid,
315 .ctime = st.st_ctime,
316 .next = pidns_hash_table[ino_hash],
317 .lastcheck = time(NULL),
318 .init_pidfd = move_fd(pidfd),
319 };
320 pidns_hash_table[ino_hash] = move_ptr(entry);
2aa59b2e
CB
321
322 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
237e200e
SH
323}
324
325/*
326 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
327 * entry for the inode number and creation time. Verify that the init pid
328 * is still valid. If not, remove it. Return the entry if valid, NULL
329 * otherwise.
330 * Must be called under store_lock
331 */
cfda2e8a 332static pid_t lookup_verify_initpid(ino_t pidns_inode)
237e200e 333{
fcdedd16 334 struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
2aa59b2e
CB
335
336 while (entry) {
fcdedd16 337 if (entry->ino == pidns_inode) {
2aa59b2e
CB
338 if (initpid_still_valid(entry)) {
339 entry->lastcheck = time(NULL);
cfda2e8a 340 return entry->initpid;
237e200e 341 }
2aa59b2e
CB
342
343 remove_initpid(entry);
cfda2e8a 344 return ret_errno(ESRCH);
237e200e 345 }
2aa59b2e 346 entry = entry->next;
237e200e
SH
347 }
348
cfda2e8a 349 return ret_errno(ESRCH);
237e200e
SH
350}
351
35acc247 352static bool send_creds_ok(int sock_fd)
237e200e 353{
f1744de4
CB
354 char v = '1'; /* we are the child */
355 struct ucred cred = {
356 .uid = 0,
357 .gid = 0,
358 .pid = 1,
359 };
360
35acc247 361 return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK;
237e200e
SH
362}
363
35acc247 364__returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
87f7558b 365{
35acc247
CB
366 /*
367 * These flags don't interest at all so we don't jump through any hoops
368 * of retrieving them and passing them to the kernel.
369 */
370 errno = EINVAL;
371 if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
372 CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
373 return -EINVAL;
374
375#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
376 /* On s390/s390x and cris the order of the first and second arguments
377 * of the system call is reversed.
378 */
379 return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
380#elif defined(__sparc__) && defined(__arch64__)
381 {
382 /*
383 * sparc64 always returns the other process id in %o0, and a
384 * boolean flag whether this is the child or the parent in %o1.
385 * Inline assembly is needed to get the flag returned in %o1.
386 */
387 register long g1 asm("g1") = __NR_clone;
388 register long o0 asm("o0") = flags | SIGCHLD;
389 register long o1 asm("o1") = 0; /* is parent/child indicator */
390 register long o2 asm("o2") = (unsigned long)pidfd;
391 long is_error, retval, in_child;
392 pid_t child_pid;
393
394 asm volatile(
395#if defined(__arch64__)
396 "t 0x6d\n\t" /* 64-bit trap */
397#else
398 "t 0x10\n\t" /* 32-bit trap */
399#endif
400 /*
401 * catch errors: On sparc, the carry bit (csr) in the
402 * processor status register (psr) is used instead of a
403 * full register.
404 */
405 "addx %%g0, 0, %%g1"
406 : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
407 : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */
408 : "%cc"); /* clobbers */
409
410 is_error = g1;
411 retval = o0;
412 in_child = o1;
413
414 if (is_error) {
415 errno = retval;
416 return -1;
417 }
87f7558b 418
35acc247
CB
419 if (in_child)
420 return 0;
87f7558b 421
35acc247
CB
422 child_pid = retval;
423 return child_pid;
424 }
425#elif defined(__ia64__)
426 /* On ia64 the stack and stack size are passed as separate arguments. */
427 return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd);
87f7558b 428#else
35acc247 429 return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
87f7558b 430#endif
87f7558b
CB
431}
432
/* Buffer size needed for a "/proc/<pid>/ns/pid" path including the '\0'. */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
436
580fe4df
CB
437/*
438 * clone a task which switches to @task's namespace and writes '1'.
439 * over a unix sock so we can read the task's reaper's pid in our
440 * namespace
441 *
442 * Note: glibc's fork() does not respect pidns, which can lead to failed
443 * assertions inside glibc (and thus failed forks) if the child's pid in
444 * the pidns and the parent pid outside are identical. Using clone prevents
445 * this issue.
446 */
447static void write_task_init_pid_exit(int sock, pid_t target)
448{
05b7a16d 449 __do_close int fd = -EBADF;
87f7558b 450 char path[LXCFS_PROC_PID_NS_LEN];
580fe4df 451 pid_t pid;
87f7558b
CB
452
453 snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
454 fd = open(path, O_RDONLY | O_CLOEXEC);
455 if (fd < 0)
456 log_exit("write_task_init_pid_exit open of ns/pid");
457
458 if (setns(fd, 0))
459 log_exit("Failed to setns to pid namespace of process %d", target);
460
35acc247 461 pid = lxcfs_raw_clone(0, NULL);
580fe4df 462 if (pid < 0)
87f7558b
CB
463 _exit(EXIT_FAILURE);
464
35acc247
CB
465 if (pid == 0) {
466 if (!send_creds_ok(sock))
87f7558b
CB
467 _exit(EXIT_FAILURE);
468
469 _exit(EXIT_SUCCESS);
237e200e 470 }
35acc247
CB
471
472 if (!wait_for_pid(pid))
473 _exit(EXIT_FAILURE);
474
475 _exit(EXIT_SUCCESS);
237e200e
SH
476}
477
8a07696e 478static pid_t scm_init_pid(pid_t task)
237e200e 479{
580fe4df 480 char v = '0';
87f7558b 481 pid_t pid_ret = -1;
dac3dc93
CB
482 struct ucred cred = {
483 .pid = -1,
484 .uid = -1,
485 .gid = -1,
486 };
87f7558b
CB
487 pid_t pid;
488 int sock[2];
237e200e 489
87f7558b 490 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
580fe4df 491 return -1;
237e200e 492
580fe4df
CB
493 pid = fork();
494 if (pid < 0)
495 goto out;
87f7558b
CB
496
497 if (pid == 0) {
580fe4df
CB
498 close(sock[1]);
499 write_task_init_pid_exit(sock[0], task);
87f7558b 500 _exit(EXIT_SUCCESS);
237e200e 501 }
7213ec5c 502
580fe4df
CB
503 if (!recv_creds(sock[1], &cred, &v))
504 goto out;
87f7558b
CB
505
506 pid_ret = cred.pid;
237e200e 507
580fe4df
CB
508out:
509 close(sock[0]);
510 close(sock[1]);
511 if (pid > 0)
512 wait_for_pid(pid);
237e200e 513
87f7558b
CB
514 return pid_ret;
515}
2aa59b2e
CB
516
517pid_t lookup_initpid_in_store(pid_t pid)
237e200e 518{
cfda2e8a 519 pid_t hashed_pid = 0;
2aa59b2e
CB
520 char path[LXCFS_PROC_PID_NS_LEN];
521 struct stat st;
2aa59b2e
CB
522
523 snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
2aa59b2e 524 if (stat(path, &st))
4e1e4115 525 return ret_errno(ESRCH);
2aa59b2e 526
4e1e4115 527 store_lock();
fcdedd16 528
cfda2e8a
CB
529 hashed_pid = lookup_verify_initpid(st.st_ino);
530 if (hashed_pid < 0) {
531 /* release the mutex as the following call is expensive */
532 store_unlock();
2aa59b2e 533
8a07696e 534 hashed_pid = scm_init_pid(pid);
4e1e4115 535
cfda2e8a 536 store_lock();
4e1e4115 537
cfda2e8a
CB
538 if (hashed_pid > 0)
539 save_initpid(st.st_ino, hashed_pid);
540 }
b7672ded 541
2aa59b2e 542 /*
cfda2e8a
CB
543 * Prune at the end in case we're pruning the value
544 * we were about to return.
2aa59b2e 545 */
580fe4df 546 prune_initpid_store();
4e1e4115 547 store_unlock();
2aa59b2e 548
cfda2e8a 549 return hashed_pid;
237e200e
SH
550}
551
29a73c2f
CB
552/*
553 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
554 */
555
29a73c2f
CB
556static bool umount_if_mounted(void)
557{
558 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 559 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
560 return false;
561 }
562 return true;
563}
564
2283e240
CB
/*
 * The type of statfs.f_type differs between platforms, so derive it from the
 * struct itself. __typeof__ should be accepted by every compiler we target.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
571
0a4dea41
CB
572/*
573 * looking at fs/proc_namespace.c, it appears we can
574 * actually expect the rootfs entry to very specifically contain
575 * " - rootfs rootfs "
576 * IIUC, so long as we've chrooted so that rootfs is not our root,
577 * the rootfs entry should always be skipped in mountinfo contents.
578 */
579static bool is_on_ramfs(void)
580{
87f7558b 581 __do_free char *line = NULL;
757a63e7 582 __do_free void *fopen_cache = NULL;
87f7558b 583 __do_fclose FILE *f = NULL;
0a4dea41 584 size_t len = 0;
0a4dea41 585
757a63e7 586 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
0a4dea41
CB
587 if (!f)
588 return false;
589
590 while (getline(&line, &len, f) != -1) {
87f7558b
CB
591 int i;
592 char *p, *p2;
593
0a4dea41
CB
594 for (p = line, i = 0; p && i < 4; i++)
595 p = strchr(p + 1, ' ');
596 if (!p)
597 continue;
87f7558b 598
0a4dea41
CB
599 p2 = strchr(p + 1, ' ');
600 if (!p2)
601 continue;
602 *p2 = '\0';
603 if (strcmp(p + 1, "/") == 0) {
87f7558b 604 /* This is '/'. Is it the ramfs? */
0a4dea41 605 p = strchr(p2 + 1, '-');
87f7558b 606 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
0a4dea41 607 return true;
0a4dea41
CB
608 }
609 }
87f7558b 610
0a4dea41
CB
611 return false;
612}
613
9b96e96e 614static int pivot_enter(void)
0a4dea41 615{
05b7a16d 616 __do_close int oldroot = -EBADF, newroot = -EBADF;
cc309f33 617
3326c17e 618 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
619 if (oldroot < 0)
620 return log_error_errno(-1, errno,
621 "Failed to open old root for fchdir");
cc309f33 622
3326c17e 623 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
624 if (newroot < 0)
625 return log_error_errno(-1, errno,
626 "Failed to open new root for fchdir");
cc309f33
CB
627
628 /* change into new root fs */
87f7558b
CB
629 if (fchdir(newroot) < 0)
630 return log_error_errno(-1,
631 errno, "Failed to change directory to new rootfs: %s",
632 ROOTDIR);
cc309f33 633
0a4dea41 634 /* pivot_root into our new root fs */
87f7558b
CB
635 if (pivot_root(".", ".") < 0)
636 return log_error_errno(-1, errno,
637 "pivot_root() syscall failed: %s",
638 strerror(errno));
0a4dea41
CB
639
640 /*
641 * At this point the old-root is mounted on top of our new-root.
642 * To unmounted it we must not be chdir'd into it, so escape back
643 * to the old-root.
644 */
87f7558b
CB
645 if (fchdir(oldroot) < 0)
646 return log_error_errno(-1, errno, "Failed to enter old root");
0a4dea41 647
87f7558b
CB
648 if (umount2(".", MNT_DETACH) < 0)
649 return log_error_errno(-1, errno, "Failed to detach old root");
0a4dea41 650
87f7558b
CB
651 if (fchdir(newroot) < 0)
652 return log_error_errno(-1, errno, "Failed to re-enter new root");
cc309f33 653
87f7558b 654 return 0;
0a4dea41
CB
655}
656
9b96e96e 657static int chroot_enter(void)
0a4dea41
CB
658{
659 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
660 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
661 return -1;
662 }
663
664 if (chroot(".") < 0) {
665 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
666 return -1;
667 }
668
669 if (chdir("/") < 0) {
670 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
671 return -1;
672 }
673
674 return 0;
675}
676
/*
 * Enter the prepared root: through chroot_enter() when "/" is a ramfs,
 * through pivot_enter() otherwise. Returns 0 on success and -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs root_fs;
	bool on_ramfs;

	if (statfs("/", &root_fs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() alone cannot be trusted: a ramfs that is really a
	 * tmpfs will likely report TMPFS_MAGIC. So a negative answer is
	 * double-checked against mountinfo by is_on_ramfs().
	 */
	on_ramfs = has_fs_type(&root_fs, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
699
700/* Prepare our new clean root. */
0232cbac 701static int permute_prepare(void)
29a73c2f
CB
702{
703 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 704 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
705 return -1;
706 }
707
708 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 709 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
710 return -1;
711 }
712
713 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 714 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
715 return -1;
716 }
717
718 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 719 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
720 return -1;
721 }
722
723 return 0;
724}
725
0232cbac
CB
/*
 * Build the new root and switch into it. The switch uses chroot() on ramfs
 * and pivot_root() everywhere else. Returns true only when both steps succeed.
 */
static bool permute_root(void)
{
	/* Preparation must succeed before we attempt to enter the new root. */
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
739
0a4dea41 740static bool cgfs_prepare_mounts(void)
29a73c2f
CB
741{
742 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 743 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
744 return false;
745 }
480262c9 746
29a73c2f 747 if (!umount_if_mounted()) {
b8defc3d 748 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
749 return false;
750 }
751
752 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 753 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
754 return false;
755 }
756
1d81c6a6 757 cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
0646f250 758 if (cgroup_ops->mntns_fd < 0) {
a257a8ee
CB
759 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
760 return false;
761 }
762
480262c9 763 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 764 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
765 return false;
766 }
480262c9 767
29a73c2f 768 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 769 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
770 return false;
771 }
480262c9 772
29a73c2f
CB
773 return true;
774}
775
0a4dea41 776static bool cgfs_mount_hierarchies(void)
29a73c2f 777{
5fbea8a6
CB
778 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
779 return false;
51c7ca35 780
5fbea8a6
CB
781 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
782 return false;
29a73c2f 783
5fbea8a6
CB
784 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
785 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
786 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
787 if ((*h)->fd < 0)
29a73c2f 788 return false;
29a73c2f 789 }
5fbea8a6 790
29a73c2f
CB
791 return true;
792}
793
/*
 * Run the three setup stages in order: prepare the private mounts, mount the
 * cgroup hierarchies, switch the root. Returns true only when all succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root();
}
807
dee86006 808static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra)
b9b6bdc9
CB
809{
810 int ret;
811
812 if (reload_successful) {
813 reload_successful = 0;
814
815 /* write() is async signal safe */
816 ret = write(STDERR_FILENO,
817 "Switched into non-virtualization mode\n",
818 STRLITERALLEN("Switched into non-virtualization mode\n"));
819 if (ret < 0)
820 goto please_compiler;
821 } else {
822 reload_successful = 1;
823
824 /* write() is async signal safe */
825 ret = write(STDERR_FILENO, "Switched into virtualization mode\n",
826 STRLITERALLEN("Switched into virtualization mode\n"));
827 if (ret < 0)
828 goto please_compiler;
829 }
830
831please_compiler:
832 /*
833 * The write() syscall is a function whose return value needs to be
4210ee1d
CB
834 * checked. Otherwise the compiler will warn.Another one could be to
835 * use syscall(__NR_write, ...) directly but whatever.
b9b6bdc9
CB
836 */
837 return;
838}
839
2243c5a9 840static void __attribute__((constructor)) lxcfs_init(void)
237e200e 841{
05b7a16d 842 __do_close int init_ns = -EBADF, root_fd = -EBADF,
de69569b 843 pidfd = -EBADF;
4ec5c9da 844 int i = 0;
2aa59b2e 845 pid_t pid;
50f7faee 846 struct hierarchy *hierarchy;
237e200e 847
c2357135 848 lxcfs_info("Running constructor %s to reload liblxcfs", __func__);
cc42d0c7 849
5fbea8a6 850 cgroup_ops = cgroup_init();
c2357135
CB
851 if (!cgroup_ops) {
852 lxcfs_info("Failed to initialize cgroup support");
853 goto broken_upgrade;
854 }
237e200e 855
480262c9 856 /* Preserve initial namespace. */
2aa59b2e
CB
857 pid = getpid();
858 init_ns = preserve_ns(pid, "mnt");
c2357135
CB
859 if (init_ns < 0) {
860 lxcfs_info("Failed to preserve initial mount namespace");
861 goto broken_upgrade;
862 }
480262c9 863
480262c9
CB
864 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
865 * to privately mount lxcfs cgroups. */
c2357135 866 if (!cgfs_setup_controllers()) {
2243c5a9 867 log_exit("Failed to setup private cgroup mounts for lxcfs");
c2357135
CB
868 goto broken_upgrade;
869 }
480262c9 870
c2357135 871 if (setns(init_ns, 0) < 0) {
2243c5a9 872 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
c2357135
CB
873 goto broken_upgrade;
874 }
29a73c2f 875
c2357135 876 if (!init_cpuview()) {
2243c5a9 877 log_exit("Failed to init CPU view");
c2357135
CB
878 goto broken_upgrade;
879 }
056adcef 880
cc42d0c7
CB
881 lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
882 lxcfs_info("hierarchies:");
4ec5c9da
CB
883
884 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
cc42d0c7
CB
885 char **controller_list = (*h)->controllers;
886 __do_free char *controllers = NULL;
887 if (controller_list && *controller_list)
888 controllers = lxc_string_join(",", (const char **)controller_list, false);
889 lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
4ec5c9da 890 }
2aa59b2e
CB
891
892 pidfd = pidfd_open(pid, 0);
893 if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
894 can_use_pidfd = true;
cc42d0c7 895 lxcfs_info("Kernel supports pidfds");
2aa59b2e 896 }
ce8fc84c 897
c6805016
CB
898 can_use_swap = cgroup_ops->can_use_swap(cgroup_ops);
899 if (can_use_swap)
900 lxcfs_info("Kernel supports swap accounting");
901 else
902 lxcfs_info("Kernel does not support swap accounting");
903
50f7faee
WB
904 hierarchy = cgroup_ops->get_hierarchy(cgroup_ops, "memory");
905 memory_is_cgroupv2 = hierarchy && is_unified_hierarchy(hierarchy);
906
cc42d0c7 907 lxcfs_info("api_extensions:");
3cf1e562
CB
908 for (size_t nr = 0; nr < nr_api_extensions; nr++)
909 lxcfs_info("- %s", api_extensions[nr]);
de69569b
CB
910
911 root_fd = open("/", O_PATH | O_CLOEXEC);
c2357135
CB
912 if (root_fd < 0)
913 lxcfs_info("%s - Failed to open root directory", strerror(errno));
914 else if (fchdir(root_fd) < 0)
915 lxcfs_info("%s - Failed to change to root directory", strerror(errno));
916
dee86006
CB
917 if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
918 lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
b9b6bdc9 919 goto broken_upgrade;
dee86006 920 }
b9b6bdc9
CB
921
922 reload_successful = 1;
c2357135 923 return;
de69569b 924
c2357135 925broken_upgrade:
b9b6bdc9 926 reload_successful = 0;
c2357135 927 lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
237e200e
SH
928}
929
2243c5a9 930static void __attribute__((destructor)) lxcfs_exit(void)
237e200e 931{
cc42d0c7
CB
932 lxcfs_info("Running destructor %s", __func__);
933
c8f77ce4 934 clear_initpid_store();
056adcef 935 free_cpuview();
2243c5a9 936 cgroup_exit(cgroup_ops);
1c4b4e38 937}
285aea40 938
0d5383b7 939void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data)
285aea40
CB
940{
941 struct fuse_context *fc = fuse_get_context();
942 can_use_sys_cpu = true;
943 has_versioned_opts = true;
944 return fc->private_data;
945}