]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/bindings.c
Merge pull request #367 from brauner/master
[mirror_lxcfs.git] / src / bindings.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
237e200e 2
1f5596dd
CB
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE
5#endif
6
7#ifndef FUSE_USE_VERSION
237e200e 8#define FUSE_USE_VERSION 26
1f5596dd
CB
9#endif
10
11#define _FILE_OFFSET_BITS 64
237e200e 12
237e200e 13#include <dirent.h>
29a73c2f 14#include <errno.h>
237e200e
SH
15#include <fcntl.h>
16#include <fuse.h>
0ecddf02 17#include <inttypes.h>
237e200e 18#include <libgen.h>
237e200e 19#include <pthread.h>
29a73c2f 20#include <sched.h>
db1b32f6 21#include <stdarg.h>
29a73c2f 22#include <stdbool.h>
0ecddf02 23#include <stdint.h>
29a73c2f
CB
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27#include <time.h>
28#include <unistd.h>
29#include <wait.h>
d89504c4 30#include <linux/magic.h>
237e200e 31#include <linux/sched.h>
29a73c2f
CB
32#include <sys/epoll.h>
33#include <sys/mman.h>
34#include <sys/mount.h>
237e200e 35#include <sys/param.h>
87f7558b 36#include <signal.h>
237e200e 37#include <sys/socket.h>
29a73c2f 38#include <sys/syscall.h>
0ecddf02 39#include <sys/sysinfo.h>
d89504c4 40#include <sys/vfs.h>
237e200e 41
ce8fc84c 42#include "api_extensions.h"
237e200e 43#include "bindings.h"
1d81c6a6 44#include "config.h"
580fe4df 45#include "cgroup_fuse.h"
5fbea8a6
CB
46#include "cgroups/cgroup.h"
47#include "cgroups/cgroup_utils.h"
c9236032 48#include "memory_utils.h"
1f5596dd 49#include "proc_cpuview.h"
1d81c6a6 50#include "utils.h"
237e200e 51
/* Set to true by lxcfs_init() once a pidfd probe on our own pid succeeds. */
static bool can_use_pidfd;

/* Outcome of the last lxcfs_init() run: true on success, false otherwise. */
static bool reload_successful;

/*
 * Tell the caller whether the library constructor ran to completion.
 *
 * Returns the value recorded in reload_successful.
 */
bool liblxcfs_functional(void)
{
	return reload_successful;
}
2aa59b2e 59
29a73c2f
CB
/*
 * pivot_root(): use the C library's declaration when HAVE_PIVOT_ROOT is
 * defined, otherwise provide a local fallback.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
/*
 * Fallback: invoke the raw syscall when its number is known, otherwise
 * fail with errno set to ENOSYS.
 */
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
74
237e200e
SH
/*
 * Cache of which pid is init for a given pid namespace.
 *
 * To find the init pid for $qpid:
 *   1. stat /proc/$qpid/ns/pid.
 *   2. Look the resulting inode number up in the store.
 *      a. Not found: fork a child into $qpid's pid namespace that sends us
 *         ucred.pid = 1, and read back the init pid as seen from our
 *         namespace. Store that pid together with the creation time of
 *         /proc/$initpid in a new entry.
 *      b. Found: check that /proc/$initpid still matches what was saved.
 *         If it does, return the cached init pid; if not, drop the entry
 *         and continue with (a).
 */
struct pidns_init_store {
	ino_t ino;			/* inode number of /proc/$pid/ns/pid */
	pid_t initpid;			/* pid of init in that namespace */
	int init_pidfd;			/* pidfd for initpid; negative when no pidfd is held */
	long ctime;			/* st_ctime of /proc/$initpid when the entry was saved */
	struct pidns_init_store *next;	/* next entry in the same hash bucket */
	long lastcheck;			/* time() of creation or of the last successful validation */
};
97
/*
 * Bucket count for the init pid cache (the original author points at the
 * kernel's own pid allocation for comparison).
 */
#define PIDNS_HASH_SIZE 4096

/* Map a key (an inode number in this file) to a bucket index. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* The cache buckets, each a singly linked list, and the mutex guarding them. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da 104
237e200e
SH
/*
 * Acquire mutex @l. A non-zero result from pthread_mutex_lock() is
 * reported through log_exit() together with its strerror() text.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
113
/* Global cgroup driver handle; assigned by lxcfs_init() from cgroup_init(). */
struct cgroup_ops *cgroup_ops;

/*
 * Release mutex @l. A non-zero result from pthread_mutex_unlock() is
 * reported through log_exit() together with its strerror() text.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
124
125static void store_lock(void)
126{
127 lock_mutex(&pidns_store_mutex);
128}
129
130static void store_unlock(void)
131{
132 unlock_mutex(&pidns_store_mutex);
133}
134
2aa59b2e
CB
/*
 * Buffer size for a "/proc/<pid>" path:
 *
 *   "/proc/"       STRLITERALLEN("/proc/")
 *   <pid-as-str>   INTTYPE_TO_STRLEN(uint64_t)
 *   '\0'           1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
143
bc189096 144static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
237e200e 145{
bc189096 146 int ret;
237e200e 147
bc189096
CB
148 if (entry->init_pidfd < 0)
149 return ret_errno(ENOSYS);
7dd6560a 150
bc189096
CB
151 ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
152 if (ret < 0) {
153 if (errno == ENOSYS)
154 return ret_errno(ENOSYS);
7dd6560a 155
bc189096 156 return 0;
2aa59b2e
CB
157 }
158
bc189096
CB
159 return 1;
160}
161
162static int initpid_still_valid_stat(struct pidns_init_store *entry)
163{
164 struct stat st;
165 char path[LXCFS_PROC_PID_LEN];
166
167 snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
168 if (stat(path, &st) || entry->ctime != st.st_ctime)
169 return 0;
170
171 return 1;
172}
173
/*
 * Decide whether @entry still describes a live init process.
 *
 * The pidfd check is tried first; only when it returns a negative value is
 * the stat-based check used instead.
 *
 * Must be called under store_lock.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int verdict = initpid_still_valid_pidfd(entry);

	if (verdict >= 0)
		return verdict == 1;

	return initpid_still_valid_stat(entry) == 1;
}
185
186/* Must be called under store_lock */
2aa59b2e 187static void remove_initpid(struct pidns_init_store *entry)
237e200e 188{
2aa59b2e
CB
189 struct pidns_init_store *it;
190 int ino_hash;
237e200e 191
2aa59b2e
CB
192 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
193 entry->initpid);
7dd6560a 194
2aa59b2e
CB
195 ino_hash = HASH(entry->ino);
196 if (pidns_hash_table[ino_hash] == entry) {
197 pidns_hash_table[ino_hash] = entry->next;
198 close_prot_errno_disarm(entry->init_pidfd);
199 free_disarm(entry);
237e200e
SH
200 return;
201 }
202
2aa59b2e
CB
203 it = pidns_hash_table[ino_hash];
204 while (it) {
205 if (it->next == entry) {
206 it->next = entry->next;
207 close_prot_errno_disarm(entry->init_pidfd);
208 free_disarm(entry);
237e200e
SH
209 return;
210 }
2aa59b2e 211 it = it->next;
237e200e
SH
212 }
213}
214
215#define PURGE_SECS 5
216/* Must be called under store_lock */
217static void prune_initpid_store(void)
218{
219 static long int last_prune = 0;
237e200e 220 long int now, threshold;
237e200e
SH
221
222 if (!last_prune) {
223 last_prune = time(NULL);
224 return;
225 }
2aa59b2e 226
237e200e
SH
227 now = time(NULL);
228 if (now < last_prune + PURGE_SECS)
229 return;
7dd6560a 230
2aa59b2e 231 lxcfs_debug("Pruning init pid cache");
7dd6560a 232
237e200e
SH
233 last_prune = now;
234 threshold = now - 2 * PURGE_SECS;
235
2aa59b2e
CB
236 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
237 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
238 if (entry->lastcheck < threshold) {
239 struct pidns_init_store *cur = entry;
7dd6560a 240
2aa59b2e 241 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
7dd6560a 242
237e200e 243 if (prev)
2aa59b2e 244 prev->next = entry->next;
237e200e 245 else
2aa59b2e
CB
246 pidns_hash_table[i] = entry->next;
247 entry = entry->next;
248 close_prot_errno_disarm(cur->init_pidfd);
249 free_disarm(cur);
237e200e 250 } else {
2aa59b2e
CB
251 prev = entry;
252 entry = entry->next;
237e200e
SH
253 }
254 }
255 }
256}
257
258/* Must be called under store_lock */
259static void save_initpid(struct stat *sb, pid_t pid)
260{
1e5d03fe 261 __do_free struct pidns_init_store *entry = NULL;
2aa59b2e
CB
262 __do_close_prot_errno int pidfd = -EBADF;
263 char path[LXCFS_PROC_PID_LEN];
264 struct lxcfs_opts *opts = fuse_get_context()->private_data;
265 struct stat st;
266 int ino_hash;
267
9973cc06 268 if (opts && opts->use_pidfd && can_use_pidfd) {
2aa59b2e
CB
269 pidfd = pidfd_open(pid, 0);
270 if (pidfd < 0)
271 return;
272 }
237e200e 273
2aa59b2e
CB
274 snprintf(path, sizeof(path), "/proc/%d", pid);
275 if (stat(path, &st))
276 return;
7dd6560a 277
1e5d03fe
CB
278 entry = malloc(sizeof(*entry));
279 if (entry)
237e200e 280 return;
2aa59b2e 281
1e5d03fe
CB
282 ino_hash = HASH(entry->ino);
283 *entry = (struct pidns_init_store){
284 .ino = sb->st_ino,
285 .initpid = pid,
286 .ctime = st.st_ctime,
287 .next = pidns_hash_table[ino_hash],
288 .lastcheck = time(NULL),
289 .init_pidfd = move_fd(pidfd),
290 };
291 pidns_hash_table[ino_hash] = move_ptr(entry);
2aa59b2e
CB
292
293 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
237e200e
SH
294}
295
296/*
297 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
298 * entry for the inode number and creation time. Verify that the init pid
299 * is still valid. If not, remove it. Return the entry if valid, NULL
300 * otherwise.
301 * Must be called under store_lock
302 */
303static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
304{
2aa59b2e
CB
305 struct pidns_init_store *entry = pidns_hash_table[HASH(sb->st_ino)];
306
307 while (entry) {
308 if (entry->ino == sb->st_ino) {
309 if (initpid_still_valid(entry)) {
310 entry->lastcheck = time(NULL);
311 return entry;
237e200e 312 }
2aa59b2e
CB
313
314 remove_initpid(entry);
237e200e
SH
315 return NULL;
316 }
2aa59b2e 317 entry = entry->next;
237e200e
SH
318 }
319
320 return NULL;
321}
322
4ec5c9da 323static int send_creds_clone_wrapper(void *arg)
237e200e 324{
f1744de4
CB
325 int sock = PTR_TO_INT(arg);
326 char v = '1'; /* we are the child */
327 struct ucred cred = {
328 .uid = 0,
329 .gid = 0,
330 .pid = 1,
331 };
332
333 return send_creds(sock, &cred, v, true) != SEND_CREDS_OK;
237e200e
SH
334}
335
87f7558b
CB
/*
 * Stack size handed to clone(): 8MB, the "standard stack limit" (i.e. the
 * glibc default thread stack size).
 */
#define __LXCFS_STACK_SIZE (8 * 1024 * 1024)

/*
 * Run @fn(@arg) in a new task created with clone() (or __clone2() on ia64).
 * SIGCHLD is always added to @flags.
 *
 * Returns the result of the clone call, or ret_errno(ENOMEM) if the stack
 * cannot be allocated. The stack is not freed by this function.
 */
static pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags)
{
	void *stack = malloc(__LXCFS_STACK_SIZE);

	if (!stack)
		return ret_errno(ENOMEM);

#ifdef __ia64__
	return __clone2(fn, stack, __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
#else
	/* clone() takes the top of the stack area. */
	return clone(fn, (char *)stack + __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
#endif
}
357
358#define LXCFS_PROC_PID_NS_LEN \
359 (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
360 STRLITERALLEN("/ns/pid") + 1)
361
580fe4df
CB
362/*
363 * clone a task which switches to @task's namespace and writes '1'.
364 * over a unix sock so we can read the task's reaper's pid in our
365 * namespace
366 *
367 * Note: glibc's fork() does not respect pidns, which can lead to failed
368 * assertions inside glibc (and thus failed forks) if the child's pid in
369 * the pidns and the parent pid outside are identical. Using clone prevents
370 * this issue.
371 */
372static void write_task_init_pid_exit(int sock, pid_t target)
373{
87f7558b
CB
374 __do_close_prot_errno int fd = -EBADF;
375 char path[LXCFS_PROC_PID_NS_LEN];
580fe4df 376 pid_t pid;
87f7558b
CB
377
378 snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
379 fd = open(path, O_RDONLY | O_CLOEXEC);
380 if (fd < 0)
381 log_exit("write_task_init_pid_exit open of ns/pid");
382
383 if (setns(fd, 0))
384 log_exit("Failed to setns to pid namespace of process %d", target);
385
f1744de4 386 pid = lxcfs_clone(send_creds_clone_wrapper, INT_TO_PTR(sock), 0);
580fe4df 387 if (pid < 0)
87f7558b
CB
388 _exit(EXIT_FAILURE);
389
580fe4df
CB
390 if (pid != 0) {
391 if (!wait_for_pid(pid))
87f7558b
CB
392 _exit(EXIT_FAILURE);
393
394 _exit(EXIT_SUCCESS);
237e200e 395 }
237e200e
SH
396}
397
580fe4df 398static pid_t get_init_pid_for_task(pid_t task)
237e200e 399{
580fe4df 400 char v = '0';
87f7558b
CB
401 pid_t pid_ret = -1;
402 pid_t pid;
403 int sock[2];
580fe4df 404 struct ucred cred;
237e200e 405
87f7558b 406 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
580fe4df 407 return -1;
237e200e 408
580fe4df
CB
409 pid = fork();
410 if (pid < 0)
411 goto out;
87f7558b
CB
412
413 if (pid == 0) {
580fe4df
CB
414 close(sock[1]);
415 write_task_init_pid_exit(sock[0], task);
87f7558b 416 _exit(EXIT_SUCCESS);
237e200e 417 }
7213ec5c 418
580fe4df
CB
419 if (!recv_creds(sock[1], &cred, &v))
420 goto out;
87f7558b
CB
421
422 pid_ret = cred.pid;
237e200e 423
580fe4df
CB
424out:
425 close(sock[0]);
426 close(sock[1]);
427 if (pid > 0)
428 wait_for_pid(pid);
237e200e 429
87f7558b
CB
430 return pid_ret;
431}
2aa59b2e
CB
432
433pid_t lookup_initpid_in_store(pid_t pid)
237e200e 434{
580fe4df 435 pid_t answer = 0;
2aa59b2e
CB
436 char path[LXCFS_PROC_PID_NS_LEN];
437 struct stat st;
438 struct pidns_init_store *entry;
439
440 snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
b7672ded 441
580fe4df 442 store_lock();
2aa59b2e 443 if (stat(path, &st))
580fe4df 444 goto out;
2aa59b2e
CB
445
446 entry = lookup_verify_initpid(&st);
447 if (entry) {
448 answer = entry->initpid;
580fe4df
CB
449 goto out;
450 }
2aa59b2e
CB
451
452 answer = get_init_pid_for_task(pid);
580fe4df 453 if (answer > 0)
2aa59b2e 454 save_initpid(&st, answer);
b7672ded 455
580fe4df 456out:
2aa59b2e
CB
457 /*
458 * Prune at the end in case we're returning the value we were about to
459 * return.
460 */
580fe4df 461 prune_initpid_store();
2aa59b2e 462
580fe4df 463 store_unlock();
2aa59b2e 464
580fe4df 465 return answer;
237e200e
SH
466}
467
29a73c2f
CB
/*
 * Functions needed to set up cgroups in the __constructor__.
 */
471
29a73c2f
CB
472static bool umount_if_mounted(void)
473{
474 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 475 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
476 return false;
477 }
478 return true;
479}
480
2283e240
CB
/*
 * Type of statfs.f_type, derived with __typeof__ (which should be safe to
 * use with all compilers).
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
487
0a4dea41
CB
488/*
489 * looking at fs/proc_namespace.c, it appears we can
490 * actually expect the rootfs entry to very specifically contain
491 * " - rootfs rootfs "
492 * IIUC, so long as we've chrooted so that rootfs is not our root,
493 * the rootfs entry should always be skipped in mountinfo contents.
494 */
495static bool is_on_ramfs(void)
496{
87f7558b 497 __do_free char *line = NULL;
757a63e7 498 __do_free void *fopen_cache = NULL;
87f7558b 499 __do_fclose FILE *f = NULL;
0a4dea41 500 size_t len = 0;
0a4dea41 501
757a63e7 502 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
0a4dea41
CB
503 if (!f)
504 return false;
505
506 while (getline(&line, &len, f) != -1) {
87f7558b
CB
507 int i;
508 char *p, *p2;
509
0a4dea41
CB
510 for (p = line, i = 0; p && i < 4; i++)
511 p = strchr(p + 1, ' ');
512 if (!p)
513 continue;
87f7558b 514
0a4dea41
CB
515 p2 = strchr(p + 1, ' ');
516 if (!p2)
517 continue;
518 *p2 = '\0';
519 if (strcmp(p + 1, "/") == 0) {
87f7558b 520 /* This is '/'. Is it the ramfs? */
0a4dea41 521 p = strchr(p2 + 1, '-');
87f7558b 522 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
0a4dea41 523 return true;
0a4dea41
CB
524 }
525 }
87f7558b 526
0a4dea41
CB
527 return false;
528}
529
cc309f33 530static int pivot_enter()
0a4dea41 531{
87f7558b 532 __do_close_prot_errno int oldroot = -EBADF, newroot = -EBADF;
cc309f33 533
3326c17e 534 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
535 if (oldroot < 0)
536 return log_error_errno(-1, errno,
537 "Failed to open old root for fchdir");
cc309f33 538
3326c17e 539 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
540 if (newroot < 0)
541 return log_error_errno(-1, errno,
542 "Failed to open new root for fchdir");
cc309f33
CB
543
544 /* change into new root fs */
87f7558b
CB
545 if (fchdir(newroot) < 0)
546 return log_error_errno(-1,
547 errno, "Failed to change directory to new rootfs: %s",
548 ROOTDIR);
cc309f33 549
0a4dea41 550 /* pivot_root into our new root fs */
87f7558b
CB
551 if (pivot_root(".", ".") < 0)
552 return log_error_errno(-1, errno,
553 "pivot_root() syscall failed: %s",
554 strerror(errno));
0a4dea41
CB
555
556 /*
557 * At this point the old-root is mounted on top of our new-root.
558 * To unmounted it we must not be chdir'd into it, so escape back
559 * to the old-root.
560 */
87f7558b
CB
561 if (fchdir(oldroot) < 0)
562 return log_error_errno(-1, errno, "Failed to enter old root");
0a4dea41 563
87f7558b
CB
564 if (umount2(".", MNT_DETACH) < 0)
565 return log_error_errno(-1, errno, "Failed to detach old root");
0a4dea41 566
87f7558b
CB
567 if (fchdir(newroot) < 0)
568 return log_error_errno(-1, errno, "Failed to re-enter new root");
cc309f33 569
87f7558b 570 return 0;
0a4dea41
CB
571}
572
573static int chroot_enter()
574{
575 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
576 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
577 return -1;
578 }
579
580 if (chroot(".") < 0) {
581 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
582 return -1;
583 }
584
585 if (chdir("/") < 0) {
586 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
587 return -1;
588 }
589
590 return 0;
591}
592
/*
 * Enter the prepared root: via chroot_enter() when "/" is a ramfs, via
 * pivot_enter() otherwise.
 *
 * Returns the result of chroot_enter() in the ramfs case; otherwise 0 on
 * success and -1 when statfs() or pivot_enter() fails.
 */
static int permute_and_enter(void)
{
	struct statfs rootfs;
	bool on_ramfs;

	if (statfs("/", &rootfs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() alone is not reliable: when the ramfs is a tmpfs it
	 * will likely report TMPFS_MAGIC. So when it says no, the mountinfo
	 * contents are checked as well (is_on_ramfs() reads
	 * /proc/self/mountinfo).
	 */
	on_ramfs = has_fs_type(&rootfs, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
615
616/* Prepare our new clean root. */
0232cbac 617static int permute_prepare(void)
29a73c2f
CB
618{
619 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 620 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
621 return -1;
622 }
623
624 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 625 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
626 return -1;
627 }
628
629 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 630 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
631 return -1;
632 }
633
634 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 635 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
636 return -1;
637 }
638
639 return 0;
640}
641
0232cbac
CB
/*
 * Prepare the new root and then enter it (chroot() on ramfs, pivot_root()
 * in all other cases).
 *
 * Returns true only when both steps succeed; the second step is skipped
 * if the first one fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
655
0a4dea41 656static bool cgfs_prepare_mounts(void)
29a73c2f
CB
657{
658 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 659 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
660 return false;
661 }
480262c9 662
29a73c2f 663 if (!umount_if_mounted()) {
b8defc3d 664 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
665 return false;
666 }
667
668 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 669 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
670 return false;
671 }
672
1d81c6a6 673 cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
0646f250 674 if (cgroup_ops->mntns_fd < 0) {
a257a8ee
CB
675 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
676 return false;
677 }
678
480262c9 679 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 680 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
681 return false;
682 }
480262c9 683
29a73c2f 684 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 685 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
686 return false;
687 }
480262c9 688
29a73c2f
CB
689 return true;
690}
691
0a4dea41 692static bool cgfs_mount_hierarchies(void)
29a73c2f 693{
5fbea8a6
CB
694 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
695 return false;
51c7ca35 696
5fbea8a6
CB
697 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
698 return false;
29a73c2f 699
5fbea8a6
CB
700 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
701 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
702 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
703 if ((*h)->fd < 0)
29a73c2f 704 return false;
29a73c2f 705 }
5fbea8a6 706
29a73c2f
CB
707 return true;
708}
709
/*
 * Build the private cgroup mounts for lxcfs: prepare the mount namespace,
 * mount the hierarchies, then switch into the new root.
 *
 * Returns true when all three steps succeed. A failure of the hierarchy
 * step is additionally logged here via log_error_errno().
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root();
}
723
2243c5a9 724static void __attribute__((constructor)) lxcfs_init(void)
237e200e 725{
de69569b
CB
726 __do_close_prot_errno int init_ns = -EBADF, root_fd = -EBADF,
727 pidfd = -EBADF;
4ec5c9da 728 int i = 0;
2aa59b2e 729 pid_t pid;
237e200e 730
c2357135 731 lxcfs_info("Running constructor %s to reload liblxcfs", __func__);
cc42d0c7 732
5fbea8a6 733 cgroup_ops = cgroup_init();
c2357135
CB
734 if (!cgroup_ops) {
735 lxcfs_info("Failed to initialize cgroup support");
736 goto broken_upgrade;
737 }
237e200e 738
480262c9 739 /* Preserve initial namespace. */
2aa59b2e
CB
740 pid = getpid();
741 init_ns = preserve_ns(pid, "mnt");
c2357135
CB
742 if (init_ns < 0) {
743 lxcfs_info("Failed to preserve initial mount namespace");
744 goto broken_upgrade;
745 }
480262c9 746
480262c9
CB
747 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
748 * to privately mount lxcfs cgroups. */
c2357135 749 if (!cgfs_setup_controllers()) {
2243c5a9 750 log_exit("Failed to setup private cgroup mounts for lxcfs");
c2357135
CB
751 goto broken_upgrade;
752 }
480262c9 753
c2357135 754 if (setns(init_ns, 0) < 0) {
2243c5a9 755 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
c2357135
CB
756 goto broken_upgrade;
757 }
29a73c2f 758
c2357135 759 if (!init_cpuview()) {
2243c5a9 760 log_exit("Failed to init CPU view");
c2357135
CB
761 goto broken_upgrade;
762 }
056adcef 763
cc42d0c7
CB
764 lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
765 lxcfs_info("hierarchies:");
4ec5c9da
CB
766
767 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
cc42d0c7
CB
768 char **controller_list = (*h)->controllers;
769 __do_free char *controllers = NULL;
770 if (controller_list && *controller_list)
771 controllers = lxc_string_join(",", (const char **)controller_list, false);
772 lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
4ec5c9da 773 }
2aa59b2e
CB
774
775 pidfd = pidfd_open(pid, 0);
776 if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
777 can_use_pidfd = true;
cc42d0c7 778 lxcfs_info("Kernel supports pidfds");
2aa59b2e 779 }
ce8fc84c 780
cc42d0c7 781 lxcfs_info("api_extensions:");
ce8fc84c 782 for (i = 0; i < nr_api_extensions; i++)
cc42d0c7 783 lxcfs_info("- %s", api_extensions[i]);
de69569b
CB
784
785 root_fd = open("/", O_PATH | O_CLOEXEC);
c2357135
CB
786 if (root_fd < 0)
787 lxcfs_info("%s - Failed to open root directory", strerror(errno));
788 else if (fchdir(root_fd) < 0)
789 lxcfs_info("%s - Failed to change to root directory", strerror(errno));
790
cbfc55fd 791 reload_successful = true;
c2357135 792 return;
de69569b 793
c2357135 794broken_upgrade:
cbfc55fd 795 reload_successful = false;
c2357135 796 lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
237e200e
SH
797}
798
2243c5a9 799static void __attribute__((destructor)) lxcfs_exit(void)
237e200e 800{
cc42d0c7
CB
801 lxcfs_info("Running destructor %s", __func__);
802
056adcef 803 free_cpuview();
2243c5a9 804 cgroup_exit(cgroup_ops);
1c4b4e38 805}