]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/bindings.c
tree-wide: use a single fuse header
[mirror_lxcfs.git] / src / bindings.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
237e200e 2
f834b6bf
SP
3#include "config.h"
4
237e200e 5#include <dirent.h>
29a73c2f 6#include <errno.h>
237e200e 7#include <fcntl.h>
0ecddf02 8#include <inttypes.h>
237e200e 9#include <libgen.h>
dee86006
CB
10#include <linux/magic.h>
11#include <linux/sched.h>
237e200e 12#include <pthread.h>
29a73c2f 13#include <sched.h>
db1b32f6 14#include <stdarg.h>
29a73c2f 15#include <stdbool.h>
0ecddf02 16#include <stdint.h>
29a73c2f
CB
17#include <stdio.h>
18#include <stdlib.h>
19#include <string.h>
29a73c2f
CB
20#include <sys/epoll.h>
21#include <sys/mman.h>
22#include <sys/mount.h>
237e200e
SH
23#include <sys/param.h>
24#include <sys/socket.h>
29a73c2f 25#include <sys/syscall.h>
0ecddf02 26#include <sys/sysinfo.h>
d89504c4 27#include <sys/vfs.h>
dee86006
CB
28#include <time.h>
29#include <unistd.h>
30#include <wait.h>
237e200e 31
237e200e 32#include "bindings.h"
e01afbb7
CB
33
34#include "api_extensions.h"
580fe4df 35#include "cgroup_fuse.h"
5fbea8a6
CB
36#include "cgroups/cgroup.h"
37#include "cgroups/cgroup_utils.h"
c9236032 38#include "memory_utils.h"
1f5596dd 39#include "proc_cpuview.h"
8364a99c 40#include "syscall_numbers.h"
1d81c6a6 41#include "utils.h"
237e200e 42
2aa59b2e 43static bool can_use_pidfd;
c6805016 44static bool can_use_swap;
285aea40
CB
45static bool can_use_sys_cpu;
46static bool has_versioned_opts;
b9b6bdc9
CB
47
48static volatile sig_atomic_t reload_successful;
cbfc55fd
CB
49
50bool liblxcfs_functional(void)
51{
b9b6bdc9 52 return reload_successful != 0;
cbfc55fd 53}
2aa59b2e 54
c6805016
CB
55bool liblxcfs_can_use_swap(void)
56{
57 return can_use_swap;
58}
59
285aea40
CB
60bool liblxcfs_can_use_sys_cpu(void)
61{
62 return can_use_sys_cpu;
63}
64
65bool liblxcfs_has_versioned_opts(void)
66{
67 return has_versioned_opts;
68}
69
29a73c2f
CB
/* Provide pivot_root() ourselves when the C library does not ship a wrapper. */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	/* No libc wrapper: issue the raw system call. */
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
79
237e200e
SH
80/*
81 * A table caching which pid is init for a pid namespace.
82 * When looking up which pid is init for $qpid, we first
83 * 1. Stat /proc/$qpid/ns/pid.
84 * 2. Check whether the ino_t is in our store.
85 * a. if not, fork a child in qpid's ns to send us
86 * ucred.pid = 1, and read the initpid. Cache
87 * initpid and creation time for /proc/initpid
88 * in a new store entry.
89 * b. if so, verify that /proc/initpid still matches
90 * what we have saved. If not, clear the store
91 * entry and go back to a. If so, return the
92 * cached initpid.
93 */
struct pidns_init_store {
	ino_t ino;			/* inode number of /proc/$pid/ns/pid */
	pid_t initpid;			/* pid of init in that pid namespace */
	int init_pidfd;			/* pidfd for initpid; negative if none is held */
	int64_t ctime;			/* st_ctime of /proc/$initpid when cached */
	struct pidns_init_store *next;	/* next entry in the same hash bucket */
	int64_t lastcheck;		/* time of the last successful validation */
};
102
103/* lol - look at how they are allocated in the kernel */
104#define PIDNS_HASH_SIZE 4096
105#define HASH(x) ((x) % PIDNS_HASH_SIZE)
106
107static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
108static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da 109
/* Lock @l; a non-zero pthread error code is reported through log_exit(). */
static void mutex_lock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
118
77f4399a 119struct cgroup_ops *cgroup_ops;
29a73c2f 120
/* Unlock @l; a non-zero pthread error code is reported through log_exit(). */
static void mutex_unlock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
129
4e1e4115 130static inline void store_lock(void)
237e200e 131{
4e1e4115 132 mutex_lock(&pidns_store_mutex);
237e200e
SH
133}
134
4e1e4115 135static inline void store_unlock(void)
237e200e 136{
4e1e4115 137 mutex_unlock(&pidns_store_mutex);
237e200e
SH
138}
139
2aa59b2e
CB
/*
 * Buffer size for a "/proc/<pid>" path:
 *
 *   "/proc/"      = STRLITERALLEN("/proc/")
 * + <pid-as-str>  = INTTYPE_TO_STRLEN(uint64_t)
 * + \0            = 1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
148
bc189096 149static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
237e200e 150{
bc189096 151 int ret;
237e200e 152
bc189096
CB
153 if (entry->init_pidfd < 0)
154 return ret_errno(ENOSYS);
7dd6560a 155
bc189096
CB
156 ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
157 if (ret < 0) {
158 if (errno == ENOSYS)
159 return ret_errno(ENOSYS);
7dd6560a 160
bc189096 161 return 0;
2aa59b2e
CB
162 }
163
bc189096
CB
164 return 1;
165}
166
167static int initpid_still_valid_stat(struct pidns_init_store *entry)
168{
169 struct stat st;
170 char path[LXCFS_PROC_PID_LEN];
171
172 snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
173 if (stat(path, &st) || entry->ctime != st.st_ctime)
174 return 0;
175
176 return 1;
177}
178
/*
 * Decide whether a cached init pid is still usable.
 * Must be called under store_lock.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int valid = initpid_still_valid_pidfd(entry);

	/* A negative result means the pidfd probe was unavailable: use stat. */
	if (valid < 0)
		valid = initpid_still_valid_stat(entry);

	return valid == 1;
}
190
191/* Must be called under store_lock */
2aa59b2e 192static void remove_initpid(struct pidns_init_store *entry)
237e200e 193{
2aa59b2e
CB
194 struct pidns_init_store *it;
195 int ino_hash;
237e200e 196
2aa59b2e
CB
197 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
198 entry->initpid);
7dd6560a 199
2aa59b2e
CB
200 ino_hash = HASH(entry->ino);
201 if (pidns_hash_table[ino_hash] == entry) {
202 pidns_hash_table[ino_hash] = entry->next;
203 close_prot_errno_disarm(entry->init_pidfd);
204 free_disarm(entry);
237e200e
SH
205 return;
206 }
207
2aa59b2e
CB
208 it = pidns_hash_table[ino_hash];
209 while (it) {
210 if (it->next == entry) {
211 it->next = entry->next;
212 close_prot_errno_disarm(entry->init_pidfd);
213 free_disarm(entry);
237e200e
SH
214 return;
215 }
2aa59b2e 216 it = it->next;
237e200e
SH
217 }
218}
219
220#define PURGE_SECS 5
221/* Must be called under store_lock */
222static void prune_initpid_store(void)
223{
1ba088ae
CB
224 static int64_t last_prune = 0;
225 int64_t now, threshold;
237e200e
SH
226
227 if (!last_prune) {
228 last_prune = time(NULL);
229 return;
230 }
2aa59b2e 231
237e200e 232 now = time(NULL);
b18d6121 233 if (now < (last_prune + PURGE_SECS))
237e200e 234 return;
7dd6560a 235
2aa59b2e 236 lxcfs_debug("Pruning init pid cache");
7dd6560a 237
237e200e
SH
238 last_prune = now;
239 threshold = now - 2 * PURGE_SECS;
240
2aa59b2e
CB
241 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
242 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
243 if (entry->lastcheck < threshold) {
244 struct pidns_init_store *cur = entry;
7dd6560a 245
2aa59b2e 246 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
7dd6560a 247
237e200e 248 if (prev)
2aa59b2e 249 prev->next = entry->next;
237e200e 250 else
2aa59b2e
CB
251 pidns_hash_table[i] = entry->next;
252 entry = entry->next;
253 close_prot_errno_disarm(cur->init_pidfd);
254 free_disarm(cur);
237e200e 255 } else {
2aa59b2e
CB
256 prev = entry;
257 entry = entry->next;
237e200e
SH
258 }
259 }
260 }
261}
262
c8f77ce4
CB
263static void clear_initpid_store(void)
264{
265 store_lock();
266 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
267 for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) {
268 struct pidns_init_store *cur = entry;
269
270 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
271
272 pidns_hash_table[i] = entry->next;
273 entry = entry->next;
274 close_prot_errno_disarm(cur->init_pidfd);
275 free_disarm(cur);
276 }
277 }
278 store_unlock();
279}
280
237e200e 281/* Must be called under store_lock */
fcdedd16 282static void save_initpid(ino_t pidns_inode, pid_t pid)
237e200e 283{
1e5d03fe 284 __do_free struct pidns_init_store *entry = NULL;
05b7a16d 285 __do_close int pidfd = -EBADF;
536620fd 286 const struct lxcfs_opts *opts = fuse_get_context()->private_data;
2aa59b2e 287 char path[LXCFS_PROC_PID_LEN];
2aa59b2e
CB
288 struct stat st;
289 int ino_hash;
290
9973cc06 291 if (opts && opts->use_pidfd && can_use_pidfd) {
2aa59b2e
CB
292 pidfd = pidfd_open(pid, 0);
293 if (pidfd < 0)
294 return;
295 }
237e200e 296
2aa59b2e
CB
297 snprintf(path, sizeof(path), "/proc/%d", pid);
298 if (stat(path, &st))
299 return;
7dd6560a 300
5ec289bf 301 entry = zalloc(sizeof(*entry));
0eb3756b 302 if (!entry)
237e200e 303 return;
2aa59b2e 304
97017213 305 ino_hash = HASH(pidns_inode);
1e5d03fe 306 *entry = (struct pidns_init_store){
fcdedd16 307 .ino = pidns_inode,
1e5d03fe
CB
308 .initpid = pid,
309 .ctime = st.st_ctime,
310 .next = pidns_hash_table[ino_hash],
311 .lastcheck = time(NULL),
312 .init_pidfd = move_fd(pidfd),
313 };
314 pidns_hash_table[ino_hash] = move_ptr(entry);
2aa59b2e
CB
315
316 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
237e200e
SH
317}
318
319/*
320 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
321 * entry for the inode number and creation time. Verify that the init pid
322 * is still valid. If not, remove it. Return the entry if valid, NULL
323 * otherwise.
324 * Must be called under store_lock
325 */
cfda2e8a 326static pid_t lookup_verify_initpid(ino_t pidns_inode)
237e200e 327{
fcdedd16 328 struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
2aa59b2e
CB
329
330 while (entry) {
fcdedd16 331 if (entry->ino == pidns_inode) {
2aa59b2e
CB
332 if (initpid_still_valid(entry)) {
333 entry->lastcheck = time(NULL);
cfda2e8a 334 return entry->initpid;
237e200e 335 }
2aa59b2e
CB
336
337 remove_initpid(entry);
cfda2e8a 338 return ret_errno(ESRCH);
237e200e 339 }
2aa59b2e 340 entry = entry->next;
237e200e
SH
341 }
342
cfda2e8a 343 return ret_errno(ESRCH);
237e200e
SH
344}
345
35acc247 346static bool send_creds_ok(int sock_fd)
237e200e 347{
f1744de4
CB
348 char v = '1'; /* we are the child */
349 struct ucred cred = {
350 .uid = 0,
351 .gid = 0,
352 .pid = 1,
353 };
354
35acc247 355 return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK;
237e200e
SH
356}
357
35acc247 358__returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
87f7558b 359{
35acc247
CB
360 /*
361 * These flags don't interest at all so we don't jump through any hoops
362 * of retrieving them and passing them to the kernel.
363 */
364 errno = EINVAL;
365 if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
366 CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
367 return -EINVAL;
368
369#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
370 /* On s390/s390x and cris the order of the first and second arguments
371 * of the system call is reversed.
372 */
373 return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
374#elif defined(__sparc__) && defined(__arch64__)
375 {
376 /*
377 * sparc64 always returns the other process id in %o0, and a
378 * boolean flag whether this is the child or the parent in %o1.
379 * Inline assembly is needed to get the flag returned in %o1.
380 */
381 register long g1 asm("g1") = __NR_clone;
382 register long o0 asm("o0") = flags | SIGCHLD;
383 register long o1 asm("o1") = 0; /* is parent/child indicator */
384 register long o2 asm("o2") = (unsigned long)pidfd;
385 long is_error, retval, in_child;
386 pid_t child_pid;
387
388 asm volatile(
389#if defined(__arch64__)
390 "t 0x6d\n\t" /* 64-bit trap */
391#else
392 "t 0x10\n\t" /* 32-bit trap */
393#endif
394 /*
395 * catch errors: On sparc, the carry bit (csr) in the
396 * processor status register (psr) is used instead of a
397 * full register.
398 */
399 "addx %%g0, 0, %%g1"
400 : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
401 : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */
402 : "%cc"); /* clobbers */
403
404 is_error = g1;
405 retval = o0;
406 in_child = o1;
407
408 if (is_error) {
409 errno = retval;
410 return -1;
411 }
87f7558b 412
35acc247
CB
413 if (in_child)
414 return 0;
87f7558b 415
35acc247
CB
416 child_pid = retval;
417 return child_pid;
418 }
419#elif defined(__ia64__)
420 /* On ia64 the stack and stack size are passed as separate arguments. */
421 return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd);
87f7558b 422#else
35acc247 423 return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
87f7558b 424#endif
87f7558b
CB
425}
426
427#define LXCFS_PROC_PID_NS_LEN \
428 (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
429 STRLITERALLEN("/ns/pid") + 1)
430
580fe4df
CB
431/*
432 * clone a task which switches to @task's namespace and writes '1'.
433 * over a unix sock so we can read the task's reaper's pid in our
434 * namespace
435 *
436 * Note: glibc's fork() does not respect pidns, which can lead to failed
437 * assertions inside glibc (and thus failed forks) if the child's pid in
438 * the pidns and the parent pid outside are identical. Using clone prevents
439 * this issue.
440 */
441static void write_task_init_pid_exit(int sock, pid_t target)
442{
05b7a16d 443 __do_close int fd = -EBADF;
87f7558b 444 char path[LXCFS_PROC_PID_NS_LEN];
580fe4df 445 pid_t pid;
87f7558b
CB
446
447 snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
448 fd = open(path, O_RDONLY | O_CLOEXEC);
449 if (fd < 0)
450 log_exit("write_task_init_pid_exit open of ns/pid");
451
452 if (setns(fd, 0))
453 log_exit("Failed to setns to pid namespace of process %d", target);
454
35acc247 455 pid = lxcfs_raw_clone(0, NULL);
580fe4df 456 if (pid < 0)
87f7558b
CB
457 _exit(EXIT_FAILURE);
458
35acc247
CB
459 if (pid == 0) {
460 if (!send_creds_ok(sock))
87f7558b
CB
461 _exit(EXIT_FAILURE);
462
463 _exit(EXIT_SUCCESS);
237e200e 464 }
35acc247
CB
465
466 if (!wait_for_pid(pid))
467 _exit(EXIT_FAILURE);
468
469 _exit(EXIT_SUCCESS);
237e200e
SH
470}
471
8a07696e 472static pid_t scm_init_pid(pid_t task)
237e200e 473{
580fe4df 474 char v = '0';
87f7558b 475 pid_t pid_ret = -1;
dac3dc93
CB
476 struct ucred cred = {
477 .pid = -1,
478 .uid = -1,
479 .gid = -1,
480 };
87f7558b
CB
481 pid_t pid;
482 int sock[2];
237e200e 483
87f7558b 484 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
580fe4df 485 return -1;
237e200e 486
580fe4df
CB
487 pid = fork();
488 if (pid < 0)
489 goto out;
87f7558b
CB
490
491 if (pid == 0) {
580fe4df
CB
492 close(sock[1]);
493 write_task_init_pid_exit(sock[0], task);
87f7558b 494 _exit(EXIT_SUCCESS);
237e200e 495 }
7213ec5c 496
580fe4df
CB
497 if (!recv_creds(sock[1], &cred, &v))
498 goto out;
87f7558b
CB
499
500 pid_ret = cred.pid;
237e200e 501
580fe4df
CB
502out:
503 close(sock[0]);
504 close(sock[1]);
505 if (pid > 0)
506 wait_for_pid(pid);
237e200e 507
87f7558b
CB
508 return pid_ret;
509}
2aa59b2e
CB
510
511pid_t lookup_initpid_in_store(pid_t pid)
237e200e 512{
cfda2e8a 513 pid_t hashed_pid = 0;
2aa59b2e
CB
514 char path[LXCFS_PROC_PID_NS_LEN];
515 struct stat st;
2aa59b2e
CB
516
517 snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
2aa59b2e 518 if (stat(path, &st))
4e1e4115 519 return ret_errno(ESRCH);
2aa59b2e 520
4e1e4115 521 store_lock();
fcdedd16 522
cfda2e8a
CB
523 hashed_pid = lookup_verify_initpid(st.st_ino);
524 if (hashed_pid < 0) {
525 /* release the mutex as the following call is expensive */
526 store_unlock();
2aa59b2e 527
8a07696e 528 hashed_pid = scm_init_pid(pid);
4e1e4115 529
cfda2e8a 530 store_lock();
4e1e4115 531
cfda2e8a
CB
532 if (hashed_pid > 0)
533 save_initpid(st.st_ino, hashed_pid);
534 }
b7672ded 535
2aa59b2e 536 /*
cfda2e8a
CB
537 * Prune at the end in case we're pruning the value
538 * we were about to return.
2aa59b2e 539 */
580fe4df 540 prune_initpid_store();
4e1e4115 541 store_unlock();
2aa59b2e 542
cfda2e8a 543 return hashed_pid;
237e200e
SH
544}
545
29a73c2f
CB
546/*
547 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
548 */
549
29a73c2f
CB
550static bool umount_if_mounted(void)
551{
552 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 553 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
554 return false;
555 }
556 return true;
557}
558
2283e240
CB
/*
 * fs_type_magic is whatever type the C library uses for statfs.f_type.
 * __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem described by @fs has magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
565
0a4dea41
CB
566/*
567 * looking at fs/proc_namespace.c, it appears we can
568 * actually expect the rootfs entry to very specifically contain
569 * " - rootfs rootfs "
570 * IIUC, so long as we've chrooted so that rootfs is not our root,
571 * the rootfs entry should always be skipped in mountinfo contents.
572 */
573static bool is_on_ramfs(void)
574{
87f7558b 575 __do_free char *line = NULL;
757a63e7 576 __do_free void *fopen_cache = NULL;
87f7558b 577 __do_fclose FILE *f = NULL;
0a4dea41 578 size_t len = 0;
0a4dea41 579
757a63e7 580 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
0a4dea41
CB
581 if (!f)
582 return false;
583
584 while (getline(&line, &len, f) != -1) {
87f7558b
CB
585 int i;
586 char *p, *p2;
587
0a4dea41
CB
588 for (p = line, i = 0; p && i < 4; i++)
589 p = strchr(p + 1, ' ');
590 if (!p)
591 continue;
87f7558b 592
0a4dea41
CB
593 p2 = strchr(p + 1, ' ');
594 if (!p2)
595 continue;
596 *p2 = '\0';
597 if (strcmp(p + 1, "/") == 0) {
87f7558b 598 /* This is '/'. Is it the ramfs? */
0a4dea41 599 p = strchr(p2 + 1, '-');
87f7558b 600 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
0a4dea41 601 return true;
0a4dea41
CB
602 }
603 }
87f7558b 604
0a4dea41
CB
605 return false;
606}
607
9b96e96e 608static int pivot_enter(void)
0a4dea41 609{
05b7a16d 610 __do_close int oldroot = -EBADF, newroot = -EBADF;
cc309f33 611
3326c17e 612 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
613 if (oldroot < 0)
614 return log_error_errno(-1, errno,
615 "Failed to open old root for fchdir");
cc309f33 616
3326c17e 617 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
618 if (newroot < 0)
619 return log_error_errno(-1, errno,
620 "Failed to open new root for fchdir");
cc309f33
CB
621
622 /* change into new root fs */
87f7558b
CB
623 if (fchdir(newroot) < 0)
624 return log_error_errno(-1,
625 errno, "Failed to change directory to new rootfs: %s",
626 ROOTDIR);
cc309f33 627
0a4dea41 628 /* pivot_root into our new root fs */
87f7558b
CB
629 if (pivot_root(".", ".") < 0)
630 return log_error_errno(-1, errno,
631 "pivot_root() syscall failed: %s",
632 strerror(errno));
0a4dea41
CB
633
634 /*
635 * At this point the old-root is mounted on top of our new-root.
636 * To unmounted it we must not be chdir'd into it, so escape back
637 * to the old-root.
638 */
87f7558b
CB
639 if (fchdir(oldroot) < 0)
640 return log_error_errno(-1, errno, "Failed to enter old root");
0a4dea41 641
87f7558b
CB
642 if (umount2(".", MNT_DETACH) < 0)
643 return log_error_errno(-1, errno, "Failed to detach old root");
0a4dea41 644
87f7558b
CB
645 if (fchdir(newroot) < 0)
646 return log_error_errno(-1, errno, "Failed to re-enter new root");
cc309f33 647
87f7558b 648 return 0;
0a4dea41
CB
649}
650
9b96e96e 651static int chroot_enter(void)
0a4dea41
CB
652{
653 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
654 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
655 return -1;
656 }
657
658 if (chroot(".") < 0) {
659 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
660 return -1;
661 }
662
663 if (chdir("/") < 0) {
664 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
665 return -1;
666 }
667
668 return 0;
669}
670
/*
 * Enter the prepared root: chroot_enter() when "/" is on a ramfs,
 * pivot_enter() otherwise. Returns 0 on success, -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs root_fs;
	bool on_ramfs;

	if (statfs("/", &root_fs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still let
	 * is_on_ramfs() inspect the mountinfo file.
	 */
	on_ramfs = has_fs_type(&root_fs, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
693
694/* Prepare our new clean root. */
0232cbac 695static int permute_prepare(void)
29a73c2f
CB
696{
697 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 698 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
699 return -1;
700 }
701
702 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 703 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
704 return -1;
705 }
706
707 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 708 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
709 return -1;
710 }
711
712 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 713 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
714 return -1;
715 }
716
717 return 0;
718}
719
0232cbac
CB
/*
 * Prepare the new root and enter it. Calls chroot() on ramfs, pivot_root()
 * in all other cases. Returns true only when both steps succeed; the second
 * step is skipped when the first one fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
733
0a4dea41 734static bool cgfs_prepare_mounts(void)
29a73c2f
CB
735{
736 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 737 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
738 return false;
739 }
480262c9 740
29a73c2f 741 if (!umount_if_mounted()) {
b8defc3d 742 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
743 return false;
744 }
745
746 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 747 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
748 return false;
749 }
750
1d81c6a6 751 cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
0646f250 752 if (cgroup_ops->mntns_fd < 0) {
a257a8ee
CB
753 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
754 return false;
755 }
756
480262c9 757 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 758 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
759 return false;
760 }
480262c9 761
29a73c2f 762 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 763 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
764 return false;
765 }
480262c9 766
29a73c2f
CB
767 return true;
768}
769
0a4dea41 770static bool cgfs_mount_hierarchies(void)
29a73c2f 771{
5fbea8a6
CB
772 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
773 return false;
51c7ca35 774
5fbea8a6
CB
775 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
776 return false;
29a73c2f 777
5fbea8a6
CB
778 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
779 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
780 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
781 if ((*h)->fd < 0)
29a73c2f 782 return false;
29a73c2f 783 }
5fbea8a6 784
29a73c2f
CB
785 return true;
786}
787
/*
 * Build the private cgroup mount setup: prepare the mounts, mount the
 * hierarchies, then switch into the new root. Returns true only when all
 * three steps succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root();
}
801
dee86006 802static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra)
b9b6bdc9
CB
803{
804 int ret;
805
806 if (reload_successful) {
807 reload_successful = 0;
808
809 /* write() is async signal safe */
810 ret = write(STDERR_FILENO,
811 "Switched into non-virtualization mode\n",
812 STRLITERALLEN("Switched into non-virtualization mode\n"));
813 if (ret < 0)
814 goto please_compiler;
815 } else {
816 reload_successful = 1;
817
818 /* write() is async signal safe */
819 ret = write(STDERR_FILENO, "Switched into virtualization mode\n",
820 STRLITERALLEN("Switched into virtualization mode\n"));
821 if (ret < 0)
822 goto please_compiler;
823 }
824
825please_compiler:
826 /*
827 * The write() syscall is a function whose return value needs to be
4210ee1d
CB
828 * checked. Otherwise the compiler will warn.Another one could be to
829 * use syscall(__NR_write, ...) directly but whatever.
b9b6bdc9
CB
830 */
831 return;
832}
833
2243c5a9 834static void __attribute__((constructor)) lxcfs_init(void)
237e200e 835{
05b7a16d 836 __do_close int init_ns = -EBADF, root_fd = -EBADF,
de69569b 837 pidfd = -EBADF;
4ec5c9da 838 int i = 0;
2aa59b2e 839 pid_t pid;
237e200e 840
c2357135 841 lxcfs_info("Running constructor %s to reload liblxcfs", __func__);
cc42d0c7 842
5fbea8a6 843 cgroup_ops = cgroup_init();
c2357135
CB
844 if (!cgroup_ops) {
845 lxcfs_info("Failed to initialize cgroup support");
846 goto broken_upgrade;
847 }
237e200e 848
480262c9 849 /* Preserve initial namespace. */
2aa59b2e
CB
850 pid = getpid();
851 init_ns = preserve_ns(pid, "mnt");
c2357135
CB
852 if (init_ns < 0) {
853 lxcfs_info("Failed to preserve initial mount namespace");
854 goto broken_upgrade;
855 }
480262c9 856
480262c9
CB
857 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
858 * to privately mount lxcfs cgroups. */
c2357135 859 if (!cgfs_setup_controllers()) {
2243c5a9 860 log_exit("Failed to setup private cgroup mounts for lxcfs");
c2357135
CB
861 goto broken_upgrade;
862 }
480262c9 863
c2357135 864 if (setns(init_ns, 0) < 0) {
2243c5a9 865 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
c2357135
CB
866 goto broken_upgrade;
867 }
29a73c2f 868
c2357135 869 if (!init_cpuview()) {
2243c5a9 870 log_exit("Failed to init CPU view");
c2357135
CB
871 goto broken_upgrade;
872 }
056adcef 873
cc42d0c7
CB
874 lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
875 lxcfs_info("hierarchies:");
4ec5c9da
CB
876
877 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
cc42d0c7
CB
878 char **controller_list = (*h)->controllers;
879 __do_free char *controllers = NULL;
880 if (controller_list && *controller_list)
881 controllers = lxc_string_join(",", (const char **)controller_list, false);
882 lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
4ec5c9da 883 }
2aa59b2e
CB
884
885 pidfd = pidfd_open(pid, 0);
886 if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
887 can_use_pidfd = true;
cc42d0c7 888 lxcfs_info("Kernel supports pidfds");
2aa59b2e 889 }
ce8fc84c 890
c6805016
CB
891 can_use_swap = cgroup_ops->can_use_swap(cgroup_ops);
892 if (can_use_swap)
893 lxcfs_info("Kernel supports swap accounting");
894 else
895 lxcfs_info("Kernel does not support swap accounting");
896
cc42d0c7 897 lxcfs_info("api_extensions:");
3cf1e562
CB
898 for (size_t nr = 0; nr < nr_api_extensions; nr++)
899 lxcfs_info("- %s", api_extensions[nr]);
de69569b
CB
900
901 root_fd = open("/", O_PATH | O_CLOEXEC);
c2357135
CB
902 if (root_fd < 0)
903 lxcfs_info("%s - Failed to open root directory", strerror(errno));
904 else if (fchdir(root_fd) < 0)
905 lxcfs_info("%s - Failed to change to root directory", strerror(errno));
906
dee86006
CB
907 if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
908 lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
b9b6bdc9 909 goto broken_upgrade;
dee86006 910 }
b9b6bdc9
CB
911
912 reload_successful = 1;
c2357135 913 return;
de69569b 914
c2357135 915broken_upgrade:
b9b6bdc9 916 reload_successful = 0;
c2357135 917 lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
237e200e
SH
918}
919
2243c5a9 920static void __attribute__((destructor)) lxcfs_exit(void)
237e200e 921{
cc42d0c7
CB
922 lxcfs_info("Running destructor %s", __func__);
923
c8f77ce4 924 clear_initpid_store();
056adcef 925 free_cpuview();
2243c5a9 926 cgroup_exit(cgroup_ops);
1c4b4e38 927}
285aea40 928
0d5383b7 929void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data)
285aea40
CB
930{
931 struct fuse_context *fc = fuse_get_context();
932 can_use_sys_cpu = true;
933 has_versioned_opts = true;
934 return fc->private_data;
935}