]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/bindings.c
bindings: fix init pid hashing
[mirror_lxcfs.git] / src / bindings.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
237e200e 2
1f5596dd
CB
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE
5#endif
6
7#ifndef FUSE_USE_VERSION
237e200e 8#define FUSE_USE_VERSION 26
1f5596dd
CB
9#endif
10
11#define _FILE_OFFSET_BITS 64
237e200e 12
237e200e 13#include <dirent.h>
29a73c2f 14#include <errno.h>
237e200e
SH
15#include <fcntl.h>
16#include <fuse.h>
0ecddf02 17#include <inttypes.h>
237e200e 18#include <libgen.h>
dee86006
CB
19#include <linux/magic.h>
20#include <linux/sched.h>
237e200e 21#include <pthread.h>
29a73c2f 22#include <sched.h>
db1b32f6 23#include <stdarg.h>
29a73c2f 24#include <stdbool.h>
0ecddf02 25#include <stdint.h>
29a73c2f
CB
26#include <stdio.h>
27#include <stdlib.h>
28#include <string.h>
29a73c2f
CB
29#include <sys/epoll.h>
30#include <sys/mman.h>
31#include <sys/mount.h>
237e200e
SH
32#include <sys/param.h>
33#include <sys/socket.h>
29a73c2f 34#include <sys/syscall.h>
0ecddf02 35#include <sys/sysinfo.h>
d89504c4 36#include <sys/vfs.h>
dee86006
CB
37#include <time.h>
38#include <unistd.h>
39#include <wait.h>
237e200e 40
ce8fc84c 41#include "api_extensions.h"
237e200e 42#include "bindings.h"
580fe4df 43#include "cgroup_fuse.h"
5fbea8a6
CB
44#include "cgroups/cgroup.h"
45#include "cgroups/cgroup_utils.h"
dee86006 46#include "config.h"
c9236032 47#include "memory_utils.h"
1f5596dd 48#include "proc_cpuview.h"
8364a99c 49#include "syscall_numbers.h"
1d81c6a6 50#include "utils.h"
237e200e 51
/* Set by the constructor once pidfd_open()/pidfd_send_signal() were seen to work. */
static bool can_use_pidfd;

/*
 * Non-zero while virtualization is switched on. Written by the constructor
 * and toggled from the SIGUSR2 handler, hence sig_atomic_t.
 */
static volatile sig_atomic_t reload_successful;

/* Report whether the library is currently in virtualization mode. */
bool liblxcfs_functional(void)
{
	if (reload_successful == 0)
		return false;

	return true;
}
2aa59b2e 60
29a73c2f
CB
/*
 * pivot_root(): use the C library's symbol when the build detected one,
 * otherwise issue the raw system call ourselves.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
70
237e200e
SH
/*
 * Cache of "which pid is init" per pid namespace.
 *
 * To resolve the init pid for some $qpid we:
 *   1. stat /proc/$qpid/ns/pid;
 *   2. look the resulting inode number up in the store:
 *      a. miss: fork a child into qpid's namespace that sends us
 *         ucred.pid = 1, read back the init pid, and record it together
 *         with the ctime of /proc/initpid in a new entry;
 *      b. hit: check that /proc/initpid still matches what was recorded.
 *         If it does, return the cached init pid; if not, drop the entry
 *         and continue as in a.
 */
struct pidns_init_store {
	ino_t ino;			/* inode number of /proc/$pid/ns/pid */
	pid_t initpid;			/* pid of init in that namespace */
	int init_pidfd;			/* pidfd for initpid; negative when none was opened */
	int64_t ctime;			/* st_ctime of /proc/$initpid when the entry was saved */
	struct pidns_init_store *next;	/* next entry in the same hash bucket */
	int64_t lastcheck;		/* time the entry was saved or last found valid */
};
93
/*
 * Bucket count of the init pid cache and the inode -> bucket mapping.
 * (Original author's remark: look at how these are allocated in the kernel.)
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads of the cache; functions touching them require store_lock. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da 100
237e200e
SH
/* Global cgroup driver handle; assigned by the constructor. */
struct cgroup_ops *cgroup_ops;

/* Lock @l; a non-zero pthread error code is reported through log_exit(). */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}

/* Unlock @l; a non-zero pthread error code is reported through log_exit(). */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
120
/*
 * Scope-exit helper: unlock the mutex a variable points at, unless the
 * variable has been reset to NULL in the meantime.
 * NOTE(review): presumably call_cleaner(unlock_mutex) resolves to this
 * function by appending "_function" - confirm against its definition.
 */
static inline void unlock_mutex_function(pthread_mutex_t **mutex)
{
	pthread_mutex_t *held = *mutex;

	if (held)
		unlock_mutex(held);
}
#define __do_unlock call_cleaner(unlock_mutex)
237e200e 127
fcdedd16 128static pthread_mutex_t* __attribute__((warn_unused_result)) store_lock(void)
237e200e 129{
fcdedd16
WB
130 lock_mutex(&pidns_store_mutex);
131 return &pidns_store_mutex;
237e200e
SH
132}
133
2aa59b2e
CB
/*
 * Buffer size needed for a "/proc/<pid>" path:
 *
 *   "/proc/"      = STRLITERALLEN("/proc/")
 * + <pid-as-str>  = INTTYPE_TO_STRLEN(uint64_t)
 * + \0            = 1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
142
bc189096 143static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
237e200e 144{
bc189096 145 int ret;
237e200e 146
bc189096
CB
147 if (entry->init_pidfd < 0)
148 return ret_errno(ENOSYS);
7dd6560a 149
bc189096
CB
150 ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
151 if (ret < 0) {
152 if (errno == ENOSYS)
153 return ret_errno(ENOSYS);
7dd6560a 154
bc189096 155 return 0;
2aa59b2e
CB
156 }
157
bc189096
CB
158 return 1;
159}
160
161static int initpid_still_valid_stat(struct pidns_init_store *entry)
162{
163 struct stat st;
164 char path[LXCFS_PROC_PID_LEN];
165
166 snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
167 if (stat(path, &st) || entry->ctime != st.st_ctime)
168 return 0;
169
170 return 1;
171}
172
/*
 * Decide whether a cache entry still describes a live init process.
 * The pidfd probe is tried first; the stat based check is only consulted
 * when the probe returns a negative value.
 * Must be called under store_lock.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int verdict = initpid_still_valid_pidfd(entry);

	if (verdict >= 0)
		return verdict == 1;

	return initpid_still_valid_stat(entry) == 1;
}
184
185/* Must be called under store_lock */
2aa59b2e 186static void remove_initpid(struct pidns_init_store *entry)
237e200e 187{
2aa59b2e
CB
188 struct pidns_init_store *it;
189 int ino_hash;
237e200e 190
2aa59b2e
CB
191 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
192 entry->initpid);
7dd6560a 193
2aa59b2e
CB
194 ino_hash = HASH(entry->ino);
195 if (pidns_hash_table[ino_hash] == entry) {
196 pidns_hash_table[ino_hash] = entry->next;
197 close_prot_errno_disarm(entry->init_pidfd);
198 free_disarm(entry);
237e200e
SH
199 return;
200 }
201
2aa59b2e
CB
202 it = pidns_hash_table[ino_hash];
203 while (it) {
204 if (it->next == entry) {
205 it->next = entry->next;
206 close_prot_errno_disarm(entry->init_pidfd);
207 free_disarm(entry);
237e200e
SH
208 return;
209 }
2aa59b2e 210 it = it->next;
237e200e
SH
211 }
212}
213
214#define PURGE_SECS 5
215/* Must be called under store_lock */
216static void prune_initpid_store(void)
217{
1ba088ae
CB
218 static int64_t last_prune = 0;
219 int64_t now, threshold;
237e200e
SH
220
221 if (!last_prune) {
222 last_prune = time(NULL);
223 return;
224 }
2aa59b2e 225
237e200e
SH
226 now = time(NULL);
227 if (now < last_prune + PURGE_SECS)
228 return;
7dd6560a 229
2aa59b2e 230 lxcfs_debug("Pruning init pid cache");
7dd6560a 231
237e200e
SH
232 last_prune = now;
233 threshold = now - 2 * PURGE_SECS;
234
2aa59b2e
CB
235 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
236 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
237 if (entry->lastcheck < threshold) {
238 struct pidns_init_store *cur = entry;
7dd6560a 239
2aa59b2e 240 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
7dd6560a 241
237e200e 242 if (prev)
2aa59b2e 243 prev->next = entry->next;
237e200e 244 else
2aa59b2e
CB
245 pidns_hash_table[i] = entry->next;
246 entry = entry->next;
247 close_prot_errno_disarm(cur->init_pidfd);
248 free_disarm(cur);
237e200e 249 } else {
2aa59b2e
CB
250 prev = entry;
251 entry = entry->next;
237e200e
SH
252 }
253 }
254 }
255}
256
257/* Must be called under store_lock */
fcdedd16 258static void save_initpid(ino_t pidns_inode, pid_t pid)
237e200e 259{
1e5d03fe 260 __do_free struct pidns_init_store *entry = NULL;
05b7a16d 261 __do_close int pidfd = -EBADF;
2aa59b2e
CB
262 char path[LXCFS_PROC_PID_LEN];
263 struct lxcfs_opts *opts = fuse_get_context()->private_data;
264 struct stat st;
265 int ino_hash;
266
9973cc06 267 if (opts && opts->use_pidfd && can_use_pidfd) {
2aa59b2e
CB
268 pidfd = pidfd_open(pid, 0);
269 if (pidfd < 0)
270 return;
271 }
237e200e 272
2aa59b2e
CB
273 snprintf(path, sizeof(path), "/proc/%d", pid);
274 if (stat(path, &st))
275 return;
7dd6560a 276
5ec289bf 277 entry = zalloc(sizeof(*entry));
0eb3756b 278 if (!entry)
237e200e 279 return;
2aa59b2e 280
97017213 281 ino_hash = HASH(pidns_inode);
1e5d03fe 282 *entry = (struct pidns_init_store){
fcdedd16 283 .ino = pidns_inode,
1e5d03fe
CB
284 .initpid = pid,
285 .ctime = st.st_ctime,
286 .next = pidns_hash_table[ino_hash],
287 .lastcheck = time(NULL),
288 .init_pidfd = move_fd(pidfd),
289 };
290 pidns_hash_table[ino_hash] = move_ptr(entry);
2aa59b2e
CB
291
292 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
237e200e
SH
293}
294
295/*
296 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
297 * entry for the inode number and creation time. Verify that the init pid
298 * is still valid. If not, remove it. Return the entry if valid, NULL
299 * otherwise.
300 * Must be called under store_lock
301 */
fcdedd16 302static struct pidns_init_store *lookup_verify_initpid(ino_t pidns_inode)
237e200e 303{
fcdedd16 304 struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
2aa59b2e
CB
305
306 while (entry) {
fcdedd16 307 if (entry->ino == pidns_inode) {
2aa59b2e
CB
308 if (initpid_still_valid(entry)) {
309 entry->lastcheck = time(NULL);
310 return entry;
237e200e 311 }
2aa59b2e
CB
312
313 remove_initpid(entry);
237e200e
SH
314 return NULL;
315 }
2aa59b2e 316 entry = entry->next;
237e200e
SH
317 }
318
319 return NULL;
320}
321
4ec5c9da 322static int send_creds_clone_wrapper(void *arg)
237e200e 323{
f1744de4
CB
324 int sock = PTR_TO_INT(arg);
325 char v = '1'; /* we are the child */
326 struct ucred cred = {
327 .uid = 0,
328 .gid = 0,
329 .pid = 1,
330 };
331
332 return send_creds(sock, &cred, v, true) != SEND_CREDS_OK;
237e200e
SH
333}
334
87f7558b
CB
/*
 * Let's use the "standard stack limit" (i.e. glibc thread size default) for
 * stack sizes: 8MB.
 */
#define __LXCFS_STACK_SIZE (8 * 1024 * 1024)

/*
 * Run @fn(@arg) in a new task created with clone(2) on a freshly allocated
 * stack. SIGCHLD is always added to @flags.
 *
 * Returns what clone() returned to the caller (child pid, or a negative
 * value on failure), or the result of ret_errno(ENOMEM) when no stack could
 * be allocated.
 *
 * The stack buffer is released in the caller whenever that is safe: always
 * when clone() failed, and on success as long as CLONE_VM was not requested,
 * because the child then runs on its own copy of the address space. With
 * CLONE_VM the child shares our memory and may still be using the buffer,
 * so it is deliberately left allocated.
 */
pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags)
{
	pid_t ret;
	char *stack;

	stack = malloc(__LXCFS_STACK_SIZE);
	if (!stack)
		return ret_errno(ENOMEM);

#ifdef __ia64__
	ret = __clone2(fn, stack, __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
#else
	/* The stack grows downwards: hand clone() the top of the buffer. */
	ret = clone(fn, stack + __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
#endif

	if (ret < 0 || !(flags & CLONE_VM)) {
		/* Keep clone()'s errno intact for the caller. */
		int saved_errno = errno;

		free(stack);
		errno = saved_errno;
	}

	return ret;
}
356
357#define LXCFS_PROC_PID_NS_LEN \
358 (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
359 STRLITERALLEN("/ns/pid") + 1)
360
580fe4df
CB
361/*
362 * clone a task which switches to @task's namespace and writes '1'.
363 * over a unix sock so we can read the task's reaper's pid in our
364 * namespace
365 *
366 * Note: glibc's fork() does not respect pidns, which can lead to failed
367 * assertions inside glibc (and thus failed forks) if the child's pid in
368 * the pidns and the parent pid outside are identical. Using clone prevents
369 * this issue.
370 */
371static void write_task_init_pid_exit(int sock, pid_t target)
372{
05b7a16d 373 __do_close int fd = -EBADF;
87f7558b 374 char path[LXCFS_PROC_PID_NS_LEN];
580fe4df 375 pid_t pid;
87f7558b
CB
376
377 snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
378 fd = open(path, O_RDONLY | O_CLOEXEC);
379 if (fd < 0)
380 log_exit("write_task_init_pid_exit open of ns/pid");
381
382 if (setns(fd, 0))
383 log_exit("Failed to setns to pid namespace of process %d", target);
384
f1744de4 385 pid = lxcfs_clone(send_creds_clone_wrapper, INT_TO_PTR(sock), 0);
580fe4df 386 if (pid < 0)
87f7558b
CB
387 _exit(EXIT_FAILURE);
388
580fe4df
CB
389 if (pid != 0) {
390 if (!wait_for_pid(pid))
87f7558b
CB
391 _exit(EXIT_FAILURE);
392
393 _exit(EXIT_SUCCESS);
237e200e 394 }
237e200e
SH
395}
396
/*
 * Determine the pid of the init process of @task's pid namespace.
 *
 * A forked helper runs write_task_init_pid_exit() on one end of a socket
 * pair; the pid carried in the credentials received on the other end is
 * returned. Returns -1 when the socket pair, the fork or the receive fails.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	struct ucred cred = {
		.pid = -1,
		.uid = -1,
		.gid = -1,
	};
	pid_t initpid = -1;
	char v = '0';
	int sock[2];
	pid_t helper;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
		return -1;

	helper = fork();
	if (helper == 0) {
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(EXIT_SUCCESS);
	}

	/* Only try to receive when there actually is a helper to talk to. */
	if (helper > 0 && recv_creds(sock[1], &cred, &v))
		initpid = cred.pid;

	close(sock[0]);
	close(sock[1]);
	if (helper > 0)
		wait_for_pid(helper);

	return initpid;
}
2aa59b2e
CB
435
436pid_t lookup_initpid_in_store(pid_t pid)
237e200e 437{
fcdedd16 438 __do_unlock pthread_mutex_t *store_mutex = NULL;
580fe4df 439 pid_t answer = 0;
2aa59b2e
CB
440 char path[LXCFS_PROC_PID_NS_LEN];
441 struct stat st;
442 struct pidns_init_store *entry;
443
444 snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
b7672ded 445
2aa59b2e 446 if (stat(path, &st))
580fe4df 447 goto out;
2aa59b2e 448
fcdedd16
WB
449 store_mutex = store_lock();
450
451 entry = lookup_verify_initpid(st.st_ino);
2aa59b2e
CB
452 if (entry) {
453 answer = entry->initpid;
580fe4df
CB
454 goto out;
455 }
2aa59b2e 456
fcdedd16
WB
457 /* release the mutex as the following call is expensive */
458 unlock_mutex(move_ptr(store_mutex));
2aa59b2e 459 answer = get_init_pid_for_task(pid);
fcdedd16
WB
460 store_mutex = store_lock();
461
580fe4df 462 if (answer > 0)
fcdedd16 463 save_initpid(st.st_ino, answer);
b7672ded 464
580fe4df 465out:
2aa59b2e
CB
466 /*
467 * Prune at the end in case we're returning the value we were about to
468 * return.
469 */
580fe4df 470 prune_initpid_store();
2aa59b2e 471
580fe4df 472 return answer;
237e200e
SH
473}
474
29a73c2f
CB
475/*
476 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
477 */
478
29a73c2f
CB
479static bool umount_if_mounted(void)
480{
481 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 482 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
483 return false;
484 }
485 return true;
486}
487
2283e240
CB
/*
 * Type of statfs::f_type, obtained with __typeof__ (which should be safe
 * to use with all compilers).
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Tell whether the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
494
0a4dea41
CB
495/*
496 * looking at fs/proc_namespace.c, it appears we can
497 * actually expect the rootfs entry to very specifically contain
498 * " - rootfs rootfs "
499 * IIUC, so long as we've chrooted so that rootfs is not our root,
500 * the rootfs entry should always be skipped in mountinfo contents.
501 */
502static bool is_on_ramfs(void)
503{
87f7558b 504 __do_free char *line = NULL;
757a63e7 505 __do_free void *fopen_cache = NULL;
87f7558b 506 __do_fclose FILE *f = NULL;
0a4dea41 507 size_t len = 0;
0a4dea41 508
757a63e7 509 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
0a4dea41
CB
510 if (!f)
511 return false;
512
513 while (getline(&line, &len, f) != -1) {
87f7558b
CB
514 int i;
515 char *p, *p2;
516
0a4dea41
CB
517 for (p = line, i = 0; p && i < 4; i++)
518 p = strchr(p + 1, ' ');
519 if (!p)
520 continue;
87f7558b 521
0a4dea41
CB
522 p2 = strchr(p + 1, ' ');
523 if (!p2)
524 continue;
525 *p2 = '\0';
526 if (strcmp(p + 1, "/") == 0) {
87f7558b 527 /* This is '/'. Is it the ramfs? */
0a4dea41 528 p = strchr(p2 + 1, '-');
87f7558b 529 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
0a4dea41 530 return true;
0a4dea41
CB
531 }
532 }
87f7558b 533
0a4dea41
CB
534 return false;
535}
536
cc309f33 537static int pivot_enter()
0a4dea41 538{
05b7a16d 539 __do_close int oldroot = -EBADF, newroot = -EBADF;
cc309f33 540
3326c17e 541 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
542 if (oldroot < 0)
543 return log_error_errno(-1, errno,
544 "Failed to open old root for fchdir");
cc309f33 545
3326c17e 546 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
547 if (newroot < 0)
548 return log_error_errno(-1, errno,
549 "Failed to open new root for fchdir");
cc309f33
CB
550
551 /* change into new root fs */
87f7558b
CB
552 if (fchdir(newroot) < 0)
553 return log_error_errno(-1,
554 errno, "Failed to change directory to new rootfs: %s",
555 ROOTDIR);
cc309f33 556
0a4dea41 557 /* pivot_root into our new root fs */
87f7558b
CB
558 if (pivot_root(".", ".") < 0)
559 return log_error_errno(-1, errno,
560 "pivot_root() syscall failed: %s",
561 strerror(errno));
0a4dea41
CB
562
563 /*
564 * At this point the old-root is mounted on top of our new-root.
565 * To unmounted it we must not be chdir'd into it, so escape back
566 * to the old-root.
567 */
87f7558b
CB
568 if (fchdir(oldroot) < 0)
569 return log_error_errno(-1, errno, "Failed to enter old root");
0a4dea41 570
87f7558b
CB
571 if (umount2(".", MNT_DETACH) < 0)
572 return log_error_errno(-1, errno, "Failed to detach old root");
0a4dea41 573
87f7558b
CB
574 if (fchdir(newroot) < 0)
575 return log_error_errno(-1, errno, "Failed to re-enter new root");
cc309f33 576
87f7558b 577 return 0;
0a4dea41
CB
578}
579
580static int chroot_enter()
581{
582 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
583 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
584 return -1;
585 }
586
587 if (chroot(".") < 0) {
588 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
589 return -1;
590 }
591
592 if (chdir("/") < 0) {
593 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
594 return -1;
595 }
596
597 return 0;
598}
599
/*
 * Enter the prepared root: via chroot_enter() when "/" is a ramfs, via
 * pivot_enter() otherwise. Returns 0 on success and -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs root_fs;

	if (statfs("/", &root_fs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still
	 * consult the mountinfo based check in is_on_ramfs().
	 */
	if (has_fs_type(&root_fs, RAMFS_MAGIC) || is_on_ramfs())
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
622
623/* Prepare our new clean root. */
0232cbac 624static int permute_prepare(void)
29a73c2f
CB
625{
626 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 627 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
628 return -1;
629 }
630
631 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 632 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
633 return -1;
634 }
635
636 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 637 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
638 return -1;
639 }
640
641 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 642 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
643 return -1;
644 }
645
646 return 0;
647}
648
0232cbac
CB
/*
 * Switch to the private root: first prepare it, then enter it (chroot() on
 * ramfs, pivot_root() in all other cases). Entering is not attempted when
 * the preparation failed.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
662
0a4dea41 663static bool cgfs_prepare_mounts(void)
29a73c2f
CB
664{
665 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 666 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
667 return false;
668 }
480262c9 669
29a73c2f 670 if (!umount_if_mounted()) {
b8defc3d 671 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
672 return false;
673 }
674
675 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 676 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
677 return false;
678 }
679
1d81c6a6 680 cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
0646f250 681 if (cgroup_ops->mntns_fd < 0) {
a257a8ee
CB
682 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
683 return false;
684 }
685
480262c9 686 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 687 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
688 return false;
689 }
480262c9 690
29a73c2f 691 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 692 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
693 return false;
694 }
480262c9 695
29a73c2f
CB
696 return true;
697}
698
0a4dea41 699static bool cgfs_mount_hierarchies(void)
29a73c2f 700{
5fbea8a6
CB
701 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
702 return false;
51c7ca35 703
5fbea8a6
CB
704 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
705 return false;
29a73c2f 706
5fbea8a6
CB
707 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
708 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
709 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
710 if ((*h)->fd < 0)
29a73c2f 711 return false;
29a73c2f 712 }
5fbea8a6 713
29a73c2f
CB
714 return true;
715}
716
/*
 * Run the three setup stages in order - prepare the private mounts, mount
 * the cgroup hierarchies, switch root - stopping at the first failure.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root() ? true : false;
}
730
dee86006 731static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra)
b9b6bdc9
CB
732{
733 int ret;
734
735 if (reload_successful) {
736 reload_successful = 0;
737
738 /* write() is async signal safe */
739 ret = write(STDERR_FILENO,
740 "Switched into non-virtualization mode\n",
741 STRLITERALLEN("Switched into non-virtualization mode\n"));
742 if (ret < 0)
743 goto please_compiler;
744 } else {
745 reload_successful = 1;
746
747 /* write() is async signal safe */
748 ret = write(STDERR_FILENO, "Switched into virtualization mode\n",
749 STRLITERALLEN("Switched into virtualization mode\n"));
750 if (ret < 0)
751 goto please_compiler;
752 }
753
754please_compiler:
755 /*
756 * The write() syscall is a function whose return value needs to be
757 * checked. Otherwise the compiler will warn. This is how we
758 * please our master. Another one could be to use
759 * syscall(__NR_write, ...) directly but whatever.
760 */
761 return;
762}
763
2243c5a9 764static void __attribute__((constructor)) lxcfs_init(void)
237e200e 765{
05b7a16d 766 __do_close int init_ns = -EBADF, root_fd = -EBADF,
de69569b 767 pidfd = -EBADF;
4ec5c9da 768 int i = 0;
2aa59b2e 769 pid_t pid;
237e200e 770
c2357135 771 lxcfs_info("Running constructor %s to reload liblxcfs", __func__);
cc42d0c7 772
5fbea8a6 773 cgroup_ops = cgroup_init();
c2357135
CB
774 if (!cgroup_ops) {
775 lxcfs_info("Failed to initialize cgroup support");
776 goto broken_upgrade;
777 }
237e200e 778
480262c9 779 /* Preserve initial namespace. */
2aa59b2e
CB
780 pid = getpid();
781 init_ns = preserve_ns(pid, "mnt");
c2357135
CB
782 if (init_ns < 0) {
783 lxcfs_info("Failed to preserve initial mount namespace");
784 goto broken_upgrade;
785 }
480262c9 786
480262c9
CB
787 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
788 * to privately mount lxcfs cgroups. */
c2357135 789 if (!cgfs_setup_controllers()) {
2243c5a9 790 log_exit("Failed to setup private cgroup mounts for lxcfs");
c2357135
CB
791 goto broken_upgrade;
792 }
480262c9 793
c2357135 794 if (setns(init_ns, 0) < 0) {
2243c5a9 795 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
c2357135
CB
796 goto broken_upgrade;
797 }
29a73c2f 798
c2357135 799 if (!init_cpuview()) {
2243c5a9 800 log_exit("Failed to init CPU view");
c2357135
CB
801 goto broken_upgrade;
802 }
056adcef 803
cc42d0c7
CB
804 lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
805 lxcfs_info("hierarchies:");
4ec5c9da
CB
806
807 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
cc42d0c7
CB
808 char **controller_list = (*h)->controllers;
809 __do_free char *controllers = NULL;
810 if (controller_list && *controller_list)
811 controllers = lxc_string_join(",", (const char **)controller_list, false);
812 lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
4ec5c9da 813 }
2aa59b2e
CB
814
815 pidfd = pidfd_open(pid, 0);
816 if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
817 can_use_pidfd = true;
cc42d0c7 818 lxcfs_info("Kernel supports pidfds");
2aa59b2e 819 }
ce8fc84c 820
cc42d0c7 821 lxcfs_info("api_extensions:");
ce8fc84c 822 for (i = 0; i < nr_api_extensions; i++)
cc42d0c7 823 lxcfs_info("- %s", api_extensions[i]);
de69569b
CB
824
825 root_fd = open("/", O_PATH | O_CLOEXEC);
c2357135
CB
826 if (root_fd < 0)
827 lxcfs_info("%s - Failed to open root directory", strerror(errno));
828 else if (fchdir(root_fd) < 0)
829 lxcfs_info("%s - Failed to change to root directory", strerror(errno));
830
dee86006
CB
831 if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
832 lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
b9b6bdc9 833 goto broken_upgrade;
dee86006 834 }
b9b6bdc9
CB
835
836 reload_successful = 1;
c2357135 837 return;
de69569b 838
c2357135 839broken_upgrade:
b9b6bdc9 840 reload_successful = 0;
c2357135 841 lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
237e200e
SH
842}
843
2243c5a9 844static void __attribute__((destructor)) lxcfs_exit(void)
237e200e 845{
cc42d0c7
CB
846 lxcfs_info("Running destructor %s", __func__);
847
056adcef 848 free_cpuview();
2243c5a9 849 cgroup_exit(cgroup_ops);
1c4b4e38 850}