]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/bindings.c
tree-wide: fix fuse header inclusion
[mirror_lxcfs.git] / src / bindings.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
237e200e 2
1f5596dd
CB
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE
5#endif
6
f834b6bf
SP
7#include "config.h"
8
237e200e 9#include <dirent.h>
29a73c2f 10#include <errno.h>
237e200e 11#include <fcntl.h>
0ecddf02 12#include <inttypes.h>
237e200e 13#include <libgen.h>
dee86006
CB
14#include <linux/magic.h>
15#include <linux/sched.h>
237e200e 16#include <pthread.h>
29a73c2f 17#include <sched.h>
db1b32f6 18#include <stdarg.h>
29a73c2f 19#include <stdbool.h>
0ecddf02 20#include <stdint.h>
29a73c2f
CB
21#include <stdio.h>
22#include <stdlib.h>
23#include <string.h>
29a73c2f
CB
24#include <sys/epoll.h>
25#include <sys/mman.h>
26#include <sys/mount.h>
237e200e
SH
27#include <sys/param.h>
28#include <sys/socket.h>
29a73c2f 29#include <sys/syscall.h>
0ecddf02 30#include <sys/sysinfo.h>
d89504c4 31#include <sys/vfs.h>
dee86006
CB
32#include <time.h>
33#include <unistd.h>
34#include <wait.h>
237e200e 35
237e200e 36#include "bindings.h"
e01afbb7
CB
37
38#include "api_extensions.h"
580fe4df 39#include "cgroup_fuse.h"
5fbea8a6
CB
40#include "cgroups/cgroup.h"
41#include "cgroups/cgroup_utils.h"
c9236032 42#include "memory_utils.h"
1f5596dd 43#include "proc_cpuview.h"
8364a99c 44#include "syscall_numbers.h"
1d81c6a6 45#include "utils.h"
237e200e 46
/*
 * Feature flags. can_use_pidfd and can_use_swap are probed in lxcfs_init();
 * can_use_sys_cpu and has_versioned_opts are switched on in
 * lxcfs_fuse_init().
 */
static bool can_use_pidfd;
static bool can_use_swap;
static bool can_use_sys_cpu;
static bool has_versioned_opts;

/*
 * Non-zero once lxcfs_init() completed; also flipped from the SIGUSR2
 * handler, which is why it is a volatile sig_atomic_t.
 */
static volatile sig_atomic_t reload_successful;
cbfc55fd
CB
53
54bool liblxcfs_functional(void)
55{
b9b6bdc9 56 return reload_successful != 0;
cbfc55fd 57}
2aa59b2e 58
c6805016
CB
59bool liblxcfs_can_use_swap(void)
60{
61 return can_use_swap;
62}
63
285aea40
CB
64bool liblxcfs_can_use_sys_cpu(void)
65{
66 return can_use_sys_cpu;
67}
68
69bool liblxcfs_has_versioned_opts(void)
70{
71 return has_versioned_opts;
72}
73
29a73c2f
CB
/*
 * pivot_root() availability shim: when the build defines HAVE_PIVOT_ROOT the
 * symbol is only declared; otherwise a local wrapper issues the raw system
 * call.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
83
237e200e
SH
/*
 * Cache entry recording which pid is init for a given pid namespace.
 *
 * Resolving the init pid for some $qpid works like this:
 *  1. stat /proc/$qpid/ns/pid.
 *  2. Look the resulting inode number up in the store.
 *     a. Not found: fork a child into qpid's namespace which sends us
 *        ucred.pid = 1, read the translated init pid, and store it
 *        together with the creation time of /proc/initpid in a new entry.
 *     b. Found: check that /proc/initpid still matches what was saved.
 *        If it does, return the cached init pid; if not, drop the entry
 *        and continue as in a.
 */
struct pidns_init_store {
	ino_t ino;     /* inode number for /proc/$pid/ns/pid */
	pid_t initpid; /* the pid of init in that ns */
	int init_pidfd; /* pidfd for initpid, negative when none was opened */
	int64_t ctime; /* the time at which /proc/$initpid was created */
	struct pidns_init_store *next; /* next entry in the same hash bucket */
	int64_t lastcheck; /* time of the last successful validation */
};
106
/*
 * The init pid cache: a fixed-size array of singly linked bucket chains,
 * indexed by the pid namespace inode number modulo the table size.
 * (Original note on the sizing: look at how they are allocated in the
 * kernel.)
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Guards pidns_hash_table; taken through store_lock()/store_unlock(). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da 113
/*
 * Lock @l. A non-zero result from pthread_mutex_lock() is handed to
 * log_exit() together with its strerror() text.
 */
static void mutex_lock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
122
/* Global cgroup driver handle; assigned from cgroup_init() in lxcfs_init(). */
struct cgroup_ops *cgroup_ops;
29a73c2f 124
/*
 * Unlock @l. A non-zero result from pthread_mutex_unlock() is handed to
 * log_exit() together with its strerror() text.
 */
static void mutex_unlock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
133
4e1e4115 134static inline void store_lock(void)
237e200e 135{
4e1e4115 136 mutex_lock(&pidns_store_mutex);
237e200e
SH
137}
138
4e1e4115 139static inline void store_unlock(void)
237e200e 140{
4e1e4115 141 mutex_unlock(&pidns_store_mutex);
237e200e
SH
142}
143
2aa59b2e
CB
/*
 * Buffer size for a "/proc/<pid>" path:
 *
 *   /proc/       = 6
 *   +
 *   <pid-as-str> = INTTYPE_TO_STRLEN(uint64_t) (sized for uint64_t, not pid_t)
 *   +
 *   \0           = 1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
152
bc189096 153static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
237e200e 154{
bc189096 155 int ret;
237e200e 156
bc189096
CB
157 if (entry->init_pidfd < 0)
158 return ret_errno(ENOSYS);
7dd6560a 159
bc189096
CB
160 ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
161 if (ret < 0) {
162 if (errno == ENOSYS)
163 return ret_errno(ENOSYS);
7dd6560a 164
bc189096 165 return 0;
2aa59b2e
CB
166 }
167
bc189096
CB
168 return 1;
169}
170
171static int initpid_still_valid_stat(struct pidns_init_store *entry)
172{
173 struct stat st;
174 char path[LXCFS_PROC_PID_LEN];
175
176 snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
177 if (stat(path, &st) || entry->ctime != st.st_ctime)
178 return 0;
179
180 return 1;
181}
182
/*
 * Decide whether a cache entry still describes a live init process.
 * The pidfd probe is tried first; only when it reports a negative value
 * is the /proc stat() comparison used instead.
 *
 * Must be called under store_lock
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int verdict = initpid_still_valid_pidfd(entry);

	if (verdict < 0)
		verdict = initpid_still_valid_stat(entry);

	return verdict == 1;
}
194
195/* Must be called under store_lock */
2aa59b2e 196static void remove_initpid(struct pidns_init_store *entry)
237e200e 197{
2aa59b2e
CB
198 struct pidns_init_store *it;
199 int ino_hash;
237e200e 200
2aa59b2e
CB
201 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
202 entry->initpid);
7dd6560a 203
2aa59b2e
CB
204 ino_hash = HASH(entry->ino);
205 if (pidns_hash_table[ino_hash] == entry) {
206 pidns_hash_table[ino_hash] = entry->next;
207 close_prot_errno_disarm(entry->init_pidfd);
208 free_disarm(entry);
237e200e
SH
209 return;
210 }
211
2aa59b2e
CB
212 it = pidns_hash_table[ino_hash];
213 while (it) {
214 if (it->next == entry) {
215 it->next = entry->next;
216 close_prot_errno_disarm(entry->init_pidfd);
217 free_disarm(entry);
237e200e
SH
218 return;
219 }
2aa59b2e 220 it = it->next;
237e200e
SH
221 }
222}
223
224#define PURGE_SECS 5
225/* Must be called under store_lock */
226static void prune_initpid_store(void)
227{
1ba088ae
CB
228 static int64_t last_prune = 0;
229 int64_t now, threshold;
237e200e
SH
230
231 if (!last_prune) {
232 last_prune = time(NULL);
233 return;
234 }
2aa59b2e 235
237e200e 236 now = time(NULL);
b18d6121 237 if (now < (last_prune + PURGE_SECS))
237e200e 238 return;
7dd6560a 239
2aa59b2e 240 lxcfs_debug("Pruning init pid cache");
7dd6560a 241
237e200e
SH
242 last_prune = now;
243 threshold = now - 2 * PURGE_SECS;
244
2aa59b2e
CB
245 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
246 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
247 if (entry->lastcheck < threshold) {
248 struct pidns_init_store *cur = entry;
7dd6560a 249
2aa59b2e 250 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
7dd6560a 251
237e200e 252 if (prev)
2aa59b2e 253 prev->next = entry->next;
237e200e 254 else
2aa59b2e
CB
255 pidns_hash_table[i] = entry->next;
256 entry = entry->next;
257 close_prot_errno_disarm(cur->init_pidfd);
258 free_disarm(cur);
237e200e 259 } else {
2aa59b2e
CB
260 prev = entry;
261 entry = entry->next;
237e200e
SH
262 }
263 }
264 }
265}
266
c8f77ce4
CB
267static void clear_initpid_store(void)
268{
269 store_lock();
270 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
271 for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) {
272 struct pidns_init_store *cur = entry;
273
274 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
275
276 pidns_hash_table[i] = entry->next;
277 entry = entry->next;
278 close_prot_errno_disarm(cur->init_pidfd);
279 free_disarm(cur);
280 }
281 }
282 store_unlock();
283}
284
237e200e 285/* Must be called under store_lock */
fcdedd16 286static void save_initpid(ino_t pidns_inode, pid_t pid)
237e200e 287{
1e5d03fe 288 __do_free struct pidns_init_store *entry = NULL;
05b7a16d 289 __do_close int pidfd = -EBADF;
536620fd 290 const struct lxcfs_opts *opts = fuse_get_context()->private_data;
2aa59b2e 291 char path[LXCFS_PROC_PID_LEN];
2aa59b2e
CB
292 struct stat st;
293 int ino_hash;
294
9973cc06 295 if (opts && opts->use_pidfd && can_use_pidfd) {
2aa59b2e
CB
296 pidfd = pidfd_open(pid, 0);
297 if (pidfd < 0)
298 return;
299 }
237e200e 300
2aa59b2e
CB
301 snprintf(path, sizeof(path), "/proc/%d", pid);
302 if (stat(path, &st))
303 return;
7dd6560a 304
5ec289bf 305 entry = zalloc(sizeof(*entry));
0eb3756b 306 if (!entry)
237e200e 307 return;
2aa59b2e 308
97017213 309 ino_hash = HASH(pidns_inode);
1e5d03fe 310 *entry = (struct pidns_init_store){
fcdedd16 311 .ino = pidns_inode,
1e5d03fe
CB
312 .initpid = pid,
313 .ctime = st.st_ctime,
314 .next = pidns_hash_table[ino_hash],
315 .lastcheck = time(NULL),
316 .init_pidfd = move_fd(pidfd),
317 };
318 pidns_hash_table[ino_hash] = move_ptr(entry);
2aa59b2e
CB
319
320 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
237e200e
SH
321}
322
323/*
324 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
325 * entry for the inode number and creation time. Verify that the init pid
326 * is still valid. If not, remove it. Return the entry if valid, NULL
327 * otherwise.
328 * Must be called under store_lock
329 */
cfda2e8a 330static pid_t lookup_verify_initpid(ino_t pidns_inode)
237e200e 331{
fcdedd16 332 struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
2aa59b2e
CB
333
334 while (entry) {
fcdedd16 335 if (entry->ino == pidns_inode) {
2aa59b2e
CB
336 if (initpid_still_valid(entry)) {
337 entry->lastcheck = time(NULL);
cfda2e8a 338 return entry->initpid;
237e200e 339 }
2aa59b2e
CB
340
341 remove_initpid(entry);
cfda2e8a 342 return ret_errno(ESRCH);
237e200e 343 }
2aa59b2e 344 entry = entry->next;
237e200e
SH
345 }
346
cfda2e8a 347 return ret_errno(ESRCH);
237e200e
SH
348}
349
35acc247 350static bool send_creds_ok(int sock_fd)
237e200e 351{
f1744de4
CB
352 char v = '1'; /* we are the child */
353 struct ucred cred = {
354 .uid = 0,
355 .gid = 0,
356 .pid = 1,
357 };
358
35acc247 359 return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK;
237e200e
SH
360}
361
35acc247 362__returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
87f7558b 363{
35acc247
CB
364 /*
365 * These flags don't interest at all so we don't jump through any hoops
366 * of retrieving them and passing them to the kernel.
367 */
368 errno = EINVAL;
369 if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
370 CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
371 return -EINVAL;
372
373#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
374 /* On s390/s390x and cris the order of the first and second arguments
375 * of the system call is reversed.
376 */
377 return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
378#elif defined(__sparc__) && defined(__arch64__)
379 {
380 /*
381 * sparc64 always returns the other process id in %o0, and a
382 * boolean flag whether this is the child or the parent in %o1.
383 * Inline assembly is needed to get the flag returned in %o1.
384 */
385 register long g1 asm("g1") = __NR_clone;
386 register long o0 asm("o0") = flags | SIGCHLD;
387 register long o1 asm("o1") = 0; /* is parent/child indicator */
388 register long o2 asm("o2") = (unsigned long)pidfd;
389 long is_error, retval, in_child;
390 pid_t child_pid;
391
392 asm volatile(
393#if defined(__arch64__)
394 "t 0x6d\n\t" /* 64-bit trap */
395#else
396 "t 0x10\n\t" /* 32-bit trap */
397#endif
398 /*
399 * catch errors: On sparc, the carry bit (csr) in the
400 * processor status register (psr) is used instead of a
401 * full register.
402 */
403 "addx %%g0, 0, %%g1"
404 : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
405 : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */
406 : "%cc"); /* clobbers */
407
408 is_error = g1;
409 retval = o0;
410 in_child = o1;
411
412 if (is_error) {
413 errno = retval;
414 return -1;
415 }
87f7558b 416
35acc247
CB
417 if (in_child)
418 return 0;
87f7558b 419
35acc247
CB
420 child_pid = retval;
421 return child_pid;
422 }
423#elif defined(__ia64__)
424 /* On ia64 the stack and stack size are passed as separate arguments. */
425 return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd);
87f7558b 426#else
35acc247 427 return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
87f7558b 428#endif
87f7558b
CB
429}
430
/*
 * Buffer size for a "/proc/<pid>/ns/pid" path: both literal parts, the
 * digits of the pid (sized with INTTYPE_TO_STRLEN(uint64_t)) and the
 * terminating \0.
 */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
434
580fe4df
CB
435/*
436 * clone a task which switches to @task's namespace and writes '1'.
437 * over a unix sock so we can read the task's reaper's pid in our
438 * namespace
439 *
440 * Note: glibc's fork() does not respect pidns, which can lead to failed
441 * assertions inside glibc (and thus failed forks) if the child's pid in
442 * the pidns and the parent pid outside are identical. Using clone prevents
443 * this issue.
444 */
445static void write_task_init_pid_exit(int sock, pid_t target)
446{
05b7a16d 447 __do_close int fd = -EBADF;
87f7558b 448 char path[LXCFS_PROC_PID_NS_LEN];
580fe4df 449 pid_t pid;
87f7558b
CB
450
451 snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
452 fd = open(path, O_RDONLY | O_CLOEXEC);
453 if (fd < 0)
454 log_exit("write_task_init_pid_exit open of ns/pid");
455
456 if (setns(fd, 0))
457 log_exit("Failed to setns to pid namespace of process %d", target);
458
35acc247 459 pid = lxcfs_raw_clone(0, NULL);
580fe4df 460 if (pid < 0)
87f7558b
CB
461 _exit(EXIT_FAILURE);
462
35acc247
CB
463 if (pid == 0) {
464 if (!send_creds_ok(sock))
87f7558b
CB
465 _exit(EXIT_FAILURE);
466
467 _exit(EXIT_SUCCESS);
237e200e 468 }
35acc247
CB
469
470 if (!wait_for_pid(pid))
471 _exit(EXIT_FAILURE);
472
473 _exit(EXIT_SUCCESS);
237e200e
SH
474}
475
8a07696e 476static pid_t scm_init_pid(pid_t task)
237e200e 477{
580fe4df 478 char v = '0';
87f7558b 479 pid_t pid_ret = -1;
dac3dc93
CB
480 struct ucred cred = {
481 .pid = -1,
482 .uid = -1,
483 .gid = -1,
484 };
87f7558b
CB
485 pid_t pid;
486 int sock[2];
237e200e 487
87f7558b 488 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
580fe4df 489 return -1;
237e200e 490
580fe4df
CB
491 pid = fork();
492 if (pid < 0)
493 goto out;
87f7558b
CB
494
495 if (pid == 0) {
580fe4df
CB
496 close(sock[1]);
497 write_task_init_pid_exit(sock[0], task);
87f7558b 498 _exit(EXIT_SUCCESS);
237e200e 499 }
7213ec5c 500
580fe4df
CB
501 if (!recv_creds(sock[1], &cred, &v))
502 goto out;
87f7558b
CB
503
504 pid_ret = cred.pid;
237e200e 505
580fe4df
CB
506out:
507 close(sock[0]);
508 close(sock[1]);
509 if (pid > 0)
510 wait_for_pid(pid);
237e200e 511
87f7558b
CB
512 return pid_ret;
513}
2aa59b2e
CB
514
515pid_t lookup_initpid_in_store(pid_t pid)
237e200e 516{
cfda2e8a 517 pid_t hashed_pid = 0;
2aa59b2e
CB
518 char path[LXCFS_PROC_PID_NS_LEN];
519 struct stat st;
2aa59b2e
CB
520
521 snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
2aa59b2e 522 if (stat(path, &st))
4e1e4115 523 return ret_errno(ESRCH);
2aa59b2e 524
4e1e4115 525 store_lock();
fcdedd16 526
cfda2e8a
CB
527 hashed_pid = lookup_verify_initpid(st.st_ino);
528 if (hashed_pid < 0) {
529 /* release the mutex as the following call is expensive */
530 store_unlock();
2aa59b2e 531
8a07696e 532 hashed_pid = scm_init_pid(pid);
4e1e4115 533
cfda2e8a 534 store_lock();
4e1e4115 535
cfda2e8a
CB
536 if (hashed_pid > 0)
537 save_initpid(st.st_ino, hashed_pid);
538 }
b7672ded 539
2aa59b2e 540 /*
cfda2e8a
CB
541 * Prune at the end in case we're pruning the value
542 * we were about to return.
2aa59b2e 543 */
580fe4df 544 prune_initpid_store();
4e1e4115 545 store_unlock();
2aa59b2e 546
cfda2e8a 547 return hashed_pid;
237e200e
SH
548}
549
29a73c2f
CB
550/*
551 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
552 */
553
29a73c2f
CB
554static bool umount_if_mounted(void)
555{
556 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 557 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
558 return false;
559 }
560 return true;
561}
562
2283e240
CB
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem magic recorded in @fs equals @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic wanted = magic_val;

	return fs->f_type == wanted;
}
569
0a4dea41
CB
570/*
571 * looking at fs/proc_namespace.c, it appears we can
572 * actually expect the rootfs entry to very specifically contain
573 * " - rootfs rootfs "
574 * IIUC, so long as we've chrooted so that rootfs is not our root,
575 * the rootfs entry should always be skipped in mountinfo contents.
576 */
577static bool is_on_ramfs(void)
578{
87f7558b 579 __do_free char *line = NULL;
757a63e7 580 __do_free void *fopen_cache = NULL;
87f7558b 581 __do_fclose FILE *f = NULL;
0a4dea41 582 size_t len = 0;
0a4dea41 583
757a63e7 584 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
0a4dea41
CB
585 if (!f)
586 return false;
587
588 while (getline(&line, &len, f) != -1) {
87f7558b
CB
589 int i;
590 char *p, *p2;
591
0a4dea41
CB
592 for (p = line, i = 0; p && i < 4; i++)
593 p = strchr(p + 1, ' ');
594 if (!p)
595 continue;
87f7558b 596
0a4dea41
CB
597 p2 = strchr(p + 1, ' ');
598 if (!p2)
599 continue;
600 *p2 = '\0';
601 if (strcmp(p + 1, "/") == 0) {
87f7558b 602 /* This is '/'. Is it the ramfs? */
0a4dea41 603 p = strchr(p2 + 1, '-');
87f7558b 604 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
0a4dea41 605 return true;
0a4dea41
CB
606 }
607 }
87f7558b 608
0a4dea41
CB
609 return false;
610}
611
cc309f33 612static int pivot_enter()
0a4dea41 613{
05b7a16d 614 __do_close int oldroot = -EBADF, newroot = -EBADF;
cc309f33 615
3326c17e 616 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
617 if (oldroot < 0)
618 return log_error_errno(-1, errno,
619 "Failed to open old root for fchdir");
cc309f33 620
3326c17e 621 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
622 if (newroot < 0)
623 return log_error_errno(-1, errno,
624 "Failed to open new root for fchdir");
cc309f33
CB
625
626 /* change into new root fs */
87f7558b
CB
627 if (fchdir(newroot) < 0)
628 return log_error_errno(-1,
629 errno, "Failed to change directory to new rootfs: %s",
630 ROOTDIR);
cc309f33 631
0a4dea41 632 /* pivot_root into our new root fs */
87f7558b
CB
633 if (pivot_root(".", ".") < 0)
634 return log_error_errno(-1, errno,
635 "pivot_root() syscall failed: %s",
636 strerror(errno));
0a4dea41
CB
637
638 /*
639 * At this point the old-root is mounted on top of our new-root.
640 * To unmounted it we must not be chdir'd into it, so escape back
641 * to the old-root.
642 */
87f7558b
CB
643 if (fchdir(oldroot) < 0)
644 return log_error_errno(-1, errno, "Failed to enter old root");
0a4dea41 645
87f7558b
CB
646 if (umount2(".", MNT_DETACH) < 0)
647 return log_error_errno(-1, errno, "Failed to detach old root");
0a4dea41 648
87f7558b
CB
649 if (fchdir(newroot) < 0)
650 return log_error_errno(-1, errno, "Failed to re-enter new root");
cc309f33 651
87f7558b 652 return 0;
0a4dea41
CB
653}
654
655static int chroot_enter()
656{
657 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
658 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
659 return -1;
660 }
661
662 if (chroot(".") < 0) {
663 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
664 return -1;
665 }
666
667 if (chdir("/") < 0) {
668 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
669 return -1;
670 }
671
672 return 0;
673}
674
/*
 * Enter the prepared root: chroot_enter() when "/" is on a ramfs,
 * pivot_enter() otherwise. Returns 0 on success and -1 on failure (or
 * whatever chroot_enter() returns on the ramfs path).
 */
static int permute_and_enter(void)
{
	struct statfs root_fs;
	bool on_ramfs;

	if (statfs("/", &root_fs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * the mountinfo contents through is_on_ramfs(). */
	on_ramfs = has_fs_type(&root_fs, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
697
698/* Prepare our new clean root. */
0232cbac 699static int permute_prepare(void)
29a73c2f
CB
700{
701 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 702 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
703 return -1;
704 }
705
706 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 707 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
708 return -1;
709 }
710
711 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 712 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
713 return -1;
714 }
715
716 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 717 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
718 return -1;
719 }
720
721 return 0;
722}
723
0232cbac
CB
/*
 * Prepare the new root and then enter it. Entering uses chroot() on ramfs
 * and pivot_root() in all other cases. Returns true only if both steps
 * succeed; the second step is not attempted when the first one fails.
 */
static bool permute_root(void)
{
	const bool prepared = permute_prepare() >= 0;

	return prepared && permute_and_enter() >= 0;
}
737
0a4dea41 738static bool cgfs_prepare_mounts(void)
29a73c2f
CB
739{
740 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 741 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
742 return false;
743 }
480262c9 744
29a73c2f 745 if (!umount_if_mounted()) {
b8defc3d 746 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
747 return false;
748 }
749
750 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 751 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
752 return false;
753 }
754
1d81c6a6 755 cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
0646f250 756 if (cgroup_ops->mntns_fd < 0) {
a257a8ee
CB
757 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
758 return false;
759 }
760
480262c9 761 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 762 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
763 return false;
764 }
480262c9 765
29a73c2f 766 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 767 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
768 return false;
769 }
480262c9 770
29a73c2f
CB
771 return true;
772}
773
0a4dea41 774static bool cgfs_mount_hierarchies(void)
29a73c2f 775{
5fbea8a6
CB
776 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
777 return false;
51c7ca35 778
5fbea8a6
CB
779 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
780 return false;
29a73c2f 781
5fbea8a6
CB
782 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
783 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
784 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
785 if ((*h)->fd < 0)
29a73c2f 786 return false;
29a73c2f 787 }
5fbea8a6 788
29a73c2f
CB
789 return true;
790}
791
/*
 * Run the three setup stages in order: prepare the private mounts, mount the
 * cgroup hierarchies, and switch into the new root. Returns true only when
 * all of them succeed; a failure of the second stage is additionally logged
 * here.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root() ? true : false;
}
805
dee86006 806static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra)
b9b6bdc9
CB
807{
808 int ret;
809
810 if (reload_successful) {
811 reload_successful = 0;
812
813 /* write() is async signal safe */
814 ret = write(STDERR_FILENO,
815 "Switched into non-virtualization mode\n",
816 STRLITERALLEN("Switched into non-virtualization mode\n"));
817 if (ret < 0)
818 goto please_compiler;
819 } else {
820 reload_successful = 1;
821
822 /* write() is async signal safe */
823 ret = write(STDERR_FILENO, "Switched into virtualization mode\n",
824 STRLITERALLEN("Switched into virtualization mode\n"));
825 if (ret < 0)
826 goto please_compiler;
827 }
828
829please_compiler:
830 /*
831 * The write() syscall is a function whose return value needs to be
4210ee1d
CB
832 * checked. Otherwise the compiler will warn.Another one could be to
833 * use syscall(__NR_write, ...) directly but whatever.
b9b6bdc9
CB
834 */
835 return;
836}
837
2243c5a9 838static void __attribute__((constructor)) lxcfs_init(void)
237e200e 839{
05b7a16d 840 __do_close int init_ns = -EBADF, root_fd = -EBADF,
de69569b 841 pidfd = -EBADF;
4ec5c9da 842 int i = 0;
2aa59b2e 843 pid_t pid;
237e200e 844
c2357135 845 lxcfs_info("Running constructor %s to reload liblxcfs", __func__);
cc42d0c7 846
5fbea8a6 847 cgroup_ops = cgroup_init();
c2357135
CB
848 if (!cgroup_ops) {
849 lxcfs_info("Failed to initialize cgroup support");
850 goto broken_upgrade;
851 }
237e200e 852
480262c9 853 /* Preserve initial namespace. */
2aa59b2e
CB
854 pid = getpid();
855 init_ns = preserve_ns(pid, "mnt");
c2357135
CB
856 if (init_ns < 0) {
857 lxcfs_info("Failed to preserve initial mount namespace");
858 goto broken_upgrade;
859 }
480262c9 860
480262c9
CB
861 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
862 * to privately mount lxcfs cgroups. */
c2357135 863 if (!cgfs_setup_controllers()) {
2243c5a9 864 log_exit("Failed to setup private cgroup mounts for lxcfs");
c2357135
CB
865 goto broken_upgrade;
866 }
480262c9 867
c2357135 868 if (setns(init_ns, 0) < 0) {
2243c5a9 869 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
c2357135
CB
870 goto broken_upgrade;
871 }
29a73c2f 872
c2357135 873 if (!init_cpuview()) {
2243c5a9 874 log_exit("Failed to init CPU view");
c2357135
CB
875 goto broken_upgrade;
876 }
056adcef 877
cc42d0c7
CB
878 lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
879 lxcfs_info("hierarchies:");
4ec5c9da
CB
880
881 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
cc42d0c7
CB
882 char **controller_list = (*h)->controllers;
883 __do_free char *controllers = NULL;
884 if (controller_list && *controller_list)
885 controllers = lxc_string_join(",", (const char **)controller_list, false);
886 lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
4ec5c9da 887 }
2aa59b2e
CB
888
889 pidfd = pidfd_open(pid, 0);
890 if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
891 can_use_pidfd = true;
cc42d0c7 892 lxcfs_info("Kernel supports pidfds");
2aa59b2e 893 }
ce8fc84c 894
c6805016
CB
895 can_use_swap = cgroup_ops->can_use_swap(cgroup_ops);
896 if (can_use_swap)
897 lxcfs_info("Kernel supports swap accounting");
898 else
899 lxcfs_info("Kernel does not support swap accounting");
900
cc42d0c7 901 lxcfs_info("api_extensions:");
ce8fc84c 902 for (i = 0; i < nr_api_extensions; i++)
cc42d0c7 903 lxcfs_info("- %s", api_extensions[i]);
de69569b
CB
904
905 root_fd = open("/", O_PATH | O_CLOEXEC);
c2357135
CB
906 if (root_fd < 0)
907 lxcfs_info("%s - Failed to open root directory", strerror(errno));
908 else if (fchdir(root_fd) < 0)
909 lxcfs_info("%s - Failed to change to root directory", strerror(errno));
910
dee86006
CB
911 if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
912 lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
b9b6bdc9 913 goto broken_upgrade;
dee86006 914 }
b9b6bdc9
CB
915
916 reload_successful = 1;
c2357135 917 return;
de69569b 918
c2357135 919broken_upgrade:
b9b6bdc9 920 reload_successful = 0;
c2357135 921 lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
237e200e
SH
922}
923
2243c5a9 924static void __attribute__((destructor)) lxcfs_exit(void)
237e200e 925{
cc42d0c7
CB
926 lxcfs_info("Running destructor %s", __func__);
927
c8f77ce4 928 clear_initpid_store();
056adcef 929 free_cpuview();
2243c5a9 930 cgroup_exit(cgroup_ops);
1c4b4e38 931}
285aea40 932
0d5383b7 933void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data)
285aea40
CB
934{
935 struct fuse_context *fc = fuse_get_context();
936 can_use_sys_cpu = true;
937 has_versioned_opts = true;
938 return fc->private_data;
939}