]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/bindings.c
tree-wide: add missing O_CLOEXEC
[mirror_lxcfs.git] / src / bindings.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
237e200e 2
1f5596dd
CB
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE
5#endif
6
7#ifndef FUSE_USE_VERSION
237e200e 8#define FUSE_USE_VERSION 26
1f5596dd
CB
9#endif
10
11#define _FILE_OFFSET_BITS 64
237e200e 12
237e200e 13#include <dirent.h>
29a73c2f 14#include <errno.h>
237e200e
SH
15#include <fcntl.h>
16#include <fuse.h>
0ecddf02 17#include <inttypes.h>
237e200e 18#include <libgen.h>
237e200e 19#include <pthread.h>
29a73c2f 20#include <sched.h>
db1b32f6 21#include <stdarg.h>
29a73c2f 22#include <stdbool.h>
0ecddf02 23#include <stdint.h>
29a73c2f
CB
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27#include <time.h>
28#include <unistd.h>
29#include <wait.h>
d89504c4 30#include <linux/magic.h>
237e200e 31#include <linux/sched.h>
29a73c2f
CB
32#include <sys/epoll.h>
33#include <sys/mman.h>
34#include <sys/mount.h>
237e200e 35#include <sys/param.h>
87f7558b 36#include <signal.h>
237e200e 37#include <sys/socket.h>
29a73c2f 38#include <sys/syscall.h>
0ecddf02 39#include <sys/sysinfo.h>
d89504c4 40#include <sys/vfs.h>
237e200e 41
ce8fc84c 42#include "api_extensions.h"
237e200e 43#include "bindings.h"
1d81c6a6 44#include "config.h"
580fe4df 45#include "cgroup_fuse.h"
5fbea8a6
CB
46#include "cgroups/cgroup.h"
47#include "cgroups/cgroup_utils.h"
c9236032 48#include "memory_utils.h"
1f5596dd 49#include "proc_cpuview.h"
1d81c6a6 50#include "utils.h"
237e200e 51
2aa59b2e
CB
/*
 * True once the constructor has successfully opened a pidfd for our own pid
 * and sent signal 0 through it. Consulted before opening pidfds for cached
 * init processes.
 */
static bool can_use_pidfd = false;
53
29a73c2f
CB
/*
 * pivot_root() may be missing from the C library. If the build did not find
 * one (HAVE_PIVOT_ROOT is undefined) provide a local definition that issues
 * the raw syscall; otherwise just declare the library symbol.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	/* No syscall number available: report "not implemented". */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
68
237e200e
SH
/*
 * Cache entry recording which pid is init for a pid namespace.
 *
 * Looking up the init pid for $qpid works as follows:
 * 1. stat /proc/$qpid/ns/pid.
 * 2. Check whether its inode number is already in the store.
 *    a. If it is not, fork a child into $qpid's pid namespace which sends us
 *       ucred.pid = 1, and read back the init pid as seen from our side.
 *       Record that pid together with the creation time of /proc/$initpid
 *       in a new store entry.
 *    b. If it is, verify that /proc/$initpid still matches what was saved.
 *       If it does not, drop the entry and go back to a. If it does, return
 *       the cached init pid.
 */
struct pidns_init_store {
	/* inode number of /proc/$pid/ns/pid; the lookup key */
	ino_t ino;
	/* pid of init in that namespace */
	pid_t initpid;
	/* pidfd referring to initpid, or a negative value if none is held */
	int init_pidfd;
	/* time at which /proc/$initpid was created (st_ctime) */
	long ctime;
	/* next entry in the same hash bucket */
	struct pidns_init_store *next;
	/* time of the last successful validation; drives pruning */
	long lastcheck;
};
91
/*
 * The init pid cache is a fixed-size table of bucket chains indexed by the
 * pid namespace inode number modulo the table size.
 * NOTE(review): the bucket count looks arbitrary (the original remark merely
 * pointed at how the kernel allocates these inode numbers).
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads; an empty bucket is NULL. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Protects pidns_hash_table; taken through store_lock()/store_unlock(). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da 98
237e200e
SH
/*
 * Lock @l. If pthread_mutex_lock() reports an error, hand the error string
 * and code to log_exit().
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
107
/*
 * Global cgroup driver handle. Assigned from cgroup_init() in the constructor
 * and passed to cgroup_exit() in the destructor.
 */
struct cgroup_ops *cgroup_ops;
29a73c2f 109
237e200e
SH
/*
 * Unlock @l. If pthread_mutex_unlock() reports an error, hand the error
 * string and code to log_exit().
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
118
119static void store_lock(void)
120{
121 lock_mutex(&pidns_store_mutex);
122}
123
124static void store_unlock(void)
125{
126 unlock_mutex(&pidns_store_mutex);
127}
128
2aa59b2e
CB
/*
 * Buffer size for a "/proc/<pid>" path:
 *
 *   "/proc/"      = STRLITERALLEN("/proc/")
 *   <pid-as-str>  = INTTYPE_TO_STRLEN(uint64_t)
 *   '\0'          = 1
 *
 * The pid is sized as a uint64_t rather than a pid_t.
 * NOTE(review): presumably a deliberately generous upper bound - confirm.
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
137
bc189096 138static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
237e200e 139{
bc189096 140 int ret;
237e200e 141
bc189096
CB
142 if (entry->init_pidfd < 0)
143 return ret_errno(ENOSYS);
7dd6560a 144
bc189096
CB
145 ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
146 if (ret < 0) {
147 if (errno == ENOSYS)
148 return ret_errno(ENOSYS);
7dd6560a 149
bc189096 150 return 0;
2aa59b2e
CB
151 }
152
bc189096
CB
153 return 1;
154}
155
156static int initpid_still_valid_stat(struct pidns_init_store *entry)
157{
158 struct stat st;
159 char path[LXCFS_PROC_PID_LEN];
160
161 snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
162 if (stat(path, &st) || entry->ctime != st.st_ctime)
163 return 0;
164
165 return 1;
166}
167
/*
 * Decide whether a cached entry still describes a live init process. The
 * pidfd-based check is tried first; if it returns a negative value the
 * stat-based check is used instead.
 *
 * Must be called under store_lock.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int valid = initpid_still_valid_pidfd(entry);

	if (valid < 0)
		valid = initpid_still_valid_stat(entry);

	return valid == 1;
}
179
180/* Must be called under store_lock */
2aa59b2e 181static void remove_initpid(struct pidns_init_store *entry)
237e200e 182{
2aa59b2e
CB
183 struct pidns_init_store *it;
184 int ino_hash;
237e200e 185
2aa59b2e
CB
186 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
187 entry->initpid);
7dd6560a 188
2aa59b2e
CB
189 ino_hash = HASH(entry->ino);
190 if (pidns_hash_table[ino_hash] == entry) {
191 pidns_hash_table[ino_hash] = entry->next;
192 close_prot_errno_disarm(entry->init_pidfd);
193 free_disarm(entry);
237e200e
SH
194 return;
195 }
196
2aa59b2e
CB
197 it = pidns_hash_table[ino_hash];
198 while (it) {
199 if (it->next == entry) {
200 it->next = entry->next;
201 close_prot_errno_disarm(entry->init_pidfd);
202 free_disarm(entry);
237e200e
SH
203 return;
204 }
2aa59b2e 205 it = it->next;
237e200e
SH
206 }
207}
208
209#define PURGE_SECS 5
210/* Must be called under store_lock */
211static void prune_initpid_store(void)
212{
213 static long int last_prune = 0;
237e200e 214 long int now, threshold;
237e200e
SH
215
216 if (!last_prune) {
217 last_prune = time(NULL);
218 return;
219 }
2aa59b2e 220
237e200e
SH
221 now = time(NULL);
222 if (now < last_prune + PURGE_SECS)
223 return;
7dd6560a 224
2aa59b2e 225 lxcfs_debug("Pruning init pid cache");
7dd6560a 226
237e200e
SH
227 last_prune = now;
228 threshold = now - 2 * PURGE_SECS;
229
2aa59b2e
CB
230 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
231 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
232 if (entry->lastcheck < threshold) {
233 struct pidns_init_store *cur = entry;
7dd6560a 234
2aa59b2e 235 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
7dd6560a 236
237e200e 237 if (prev)
2aa59b2e 238 prev->next = entry->next;
237e200e 239 else
2aa59b2e
CB
240 pidns_hash_table[i] = entry->next;
241 entry = entry->next;
242 close_prot_errno_disarm(cur->init_pidfd);
243 free_disarm(cur);
237e200e 244 } else {
2aa59b2e
CB
245 prev = entry;
246 entry = entry->next;
237e200e
SH
247 }
248 }
249 }
250}
251
252/* Must be called under store_lock */
253static void save_initpid(struct stat *sb, pid_t pid)
254{
1e5d03fe 255 __do_free struct pidns_init_store *entry = NULL;
2aa59b2e
CB
256 __do_close_prot_errno int pidfd = -EBADF;
257 char path[LXCFS_PROC_PID_LEN];
258 struct lxcfs_opts *opts = fuse_get_context()->private_data;
259 struct stat st;
260 int ino_hash;
261
9973cc06 262 if (opts && opts->use_pidfd && can_use_pidfd) {
2aa59b2e
CB
263 pidfd = pidfd_open(pid, 0);
264 if (pidfd < 0)
265 return;
266 }
237e200e 267
2aa59b2e
CB
268 snprintf(path, sizeof(path), "/proc/%d", pid);
269 if (stat(path, &st))
270 return;
7dd6560a 271
1e5d03fe
CB
272 entry = malloc(sizeof(*entry));
273 if (entry)
237e200e 274 return;
2aa59b2e 275
1e5d03fe
CB
276 ino_hash = HASH(entry->ino);
277 *entry = (struct pidns_init_store){
278 .ino = sb->st_ino,
279 .initpid = pid,
280 .ctime = st.st_ctime,
281 .next = pidns_hash_table[ino_hash],
282 .lastcheck = time(NULL),
283 .init_pidfd = move_fd(pidfd),
284 };
285 pidns_hash_table[ino_hash] = move_ptr(entry);
2aa59b2e
CB
286
287 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
237e200e
SH
288}
289
290/*
291 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
292 * entry for the inode number and creation time. Verify that the init pid
293 * is still valid. If not, remove it. Return the entry if valid, NULL
294 * otherwise.
295 * Must be called under store_lock
296 */
297static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
298{
2aa59b2e
CB
299 struct pidns_init_store *entry = pidns_hash_table[HASH(sb->st_ino)];
300
301 while (entry) {
302 if (entry->ino == sb->st_ino) {
303 if (initpid_still_valid(entry)) {
304 entry->lastcheck = time(NULL);
305 return entry;
237e200e 306 }
2aa59b2e
CB
307
308 remove_initpid(entry);
237e200e
SH
309 return NULL;
310 }
2aa59b2e 311 entry = entry->next;
237e200e
SH
312 }
313
314 return NULL;
315}
316
4ec5c9da 317static int send_creds_clone_wrapper(void *arg)
237e200e 318{
f1744de4
CB
319 int sock = PTR_TO_INT(arg);
320 char v = '1'; /* we are the child */
321 struct ucred cred = {
322 .uid = 0,
323 .gid = 0,
324 .pid = 1,
325 };
326
327 return send_creds(sock, &cred, v, true) != SEND_CREDS_OK;
237e200e
SH
328}
329
87f7558b
CB
330/*
331 * Let's use the "standard stack limit" (i.e. glibc thread size default) for
332 * stack sizes: 8MB.
333 */
334#define __LXCFS_STACK_SIZE (8 * 1024 * 1024)
335static pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags)
336{
337 pid_t ret;
338 void *stack;
339
340 stack = malloc(__LXCFS_STACK_SIZE);
341 if (!stack)
342 return ret_errno(ENOMEM);
343
344#ifdef __ia64__
345 ret = __clone2(fn, stack, __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
346#else
347 ret = clone(fn, stack + __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
348#endif
349 return ret;
350}
351
/*
 * Buffer size for a "/proc/<pid>/ns/pid" path: both literal parts, the
 * decimal pid (sized as a uint64_t) and the terminating '\0'.
 */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
355
580fe4df
CB
356/*
357 * clone a task which switches to @task's namespace and writes '1'.
358 * over a unix sock so we can read the task's reaper's pid in our
359 * namespace
360 *
361 * Note: glibc's fork() does not respect pidns, which can lead to failed
362 * assertions inside glibc (and thus failed forks) if the child's pid in
363 * the pidns and the parent pid outside are identical. Using clone prevents
364 * this issue.
365 */
366static void write_task_init_pid_exit(int sock, pid_t target)
367{
87f7558b
CB
368 __do_close_prot_errno int fd = -EBADF;
369 char path[LXCFS_PROC_PID_NS_LEN];
580fe4df 370 pid_t pid;
87f7558b
CB
371
372 snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
373 fd = open(path, O_RDONLY | O_CLOEXEC);
374 if (fd < 0)
375 log_exit("write_task_init_pid_exit open of ns/pid");
376
377 if (setns(fd, 0))
378 log_exit("Failed to setns to pid namespace of process %d", target);
379
f1744de4 380 pid = lxcfs_clone(send_creds_clone_wrapper, INT_TO_PTR(sock), 0);
580fe4df 381 if (pid < 0)
87f7558b
CB
382 _exit(EXIT_FAILURE);
383
580fe4df
CB
384 if (pid != 0) {
385 if (!wait_for_pid(pid))
87f7558b
CB
386 _exit(EXIT_FAILURE);
387
388 _exit(EXIT_SUCCESS);
237e200e 389 }
237e200e
SH
390}
391
580fe4df 392static pid_t get_init_pid_for_task(pid_t task)
237e200e 393{
580fe4df 394 char v = '0';
87f7558b
CB
395 pid_t pid_ret = -1;
396 pid_t pid;
397 int sock[2];
580fe4df 398 struct ucred cred;
237e200e 399
87f7558b 400 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
580fe4df 401 return -1;
237e200e 402
580fe4df
CB
403 pid = fork();
404 if (pid < 0)
405 goto out;
87f7558b
CB
406
407 if (pid == 0) {
580fe4df
CB
408 close(sock[1]);
409 write_task_init_pid_exit(sock[0], task);
87f7558b 410 _exit(EXIT_SUCCESS);
237e200e 411 }
7213ec5c 412
580fe4df
CB
413 if (!recv_creds(sock[1], &cred, &v))
414 goto out;
87f7558b
CB
415
416 pid_ret = cred.pid;
237e200e 417
580fe4df
CB
418out:
419 close(sock[0]);
420 close(sock[1]);
421 if (pid > 0)
422 wait_for_pid(pid);
237e200e 423
87f7558b
CB
424 return pid_ret;
425}
2aa59b2e
CB
426
427pid_t lookup_initpid_in_store(pid_t pid)
237e200e 428{
580fe4df 429 pid_t answer = 0;
2aa59b2e
CB
430 char path[LXCFS_PROC_PID_NS_LEN];
431 struct stat st;
432 struct pidns_init_store *entry;
433
434 snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
b7672ded 435
580fe4df 436 store_lock();
2aa59b2e 437 if (stat(path, &st))
580fe4df 438 goto out;
2aa59b2e
CB
439
440 entry = lookup_verify_initpid(&st);
441 if (entry) {
442 answer = entry->initpid;
580fe4df
CB
443 goto out;
444 }
2aa59b2e
CB
445
446 answer = get_init_pid_for_task(pid);
580fe4df 447 if (answer > 0)
2aa59b2e 448 save_initpid(&st, answer);
b7672ded 449
580fe4df 450out:
2aa59b2e
CB
451 /*
452 * Prune at the end in case we're returning the value we were about to
453 * return.
454 */
580fe4df 455 prune_initpid_store();
2aa59b2e 456
580fe4df 457 store_unlock();
2aa59b2e 458
580fe4df 459 return answer;
237e200e
SH
460}
461
29a73c2f
CB
462/*
463 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
464 */
465
29a73c2f
CB
466static bool umount_if_mounted(void)
467{
468 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 469 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
470 return false;
471 }
472 return true;
473}
474
2283e240
CB
/*
 * The type of statfs.f_type differs between platforms, so derive it from the
 * struct itself. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true if the filesystem described by @fs has magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	fs_type_magic wanted = magic_val;

	return fs->f_type == wanted;
}
481
0a4dea41
CB
482/*
483 * looking at fs/proc_namespace.c, it appears we can
484 * actually expect the rootfs entry to very specifically contain
485 * " - rootfs rootfs "
486 * IIUC, so long as we've chrooted so that rootfs is not our root,
487 * the rootfs entry should always be skipped in mountinfo contents.
488 */
489static bool is_on_ramfs(void)
490{
87f7558b 491 __do_free char *line = NULL;
757a63e7 492 __do_free void *fopen_cache = NULL;
87f7558b 493 __do_fclose FILE *f = NULL;
0a4dea41 494 size_t len = 0;
0a4dea41 495
757a63e7 496 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
0a4dea41
CB
497 if (!f)
498 return false;
499
500 while (getline(&line, &len, f) != -1) {
87f7558b
CB
501 int i;
502 char *p, *p2;
503
0a4dea41
CB
504 for (p = line, i = 0; p && i < 4; i++)
505 p = strchr(p + 1, ' ');
506 if (!p)
507 continue;
87f7558b 508
0a4dea41
CB
509 p2 = strchr(p + 1, ' ');
510 if (!p2)
511 continue;
512 *p2 = '\0';
513 if (strcmp(p + 1, "/") == 0) {
87f7558b 514 /* This is '/'. Is it the ramfs? */
0a4dea41 515 p = strchr(p2 + 1, '-');
87f7558b 516 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
0a4dea41 517 return true;
0a4dea41
CB
518 }
519 }
87f7558b 520
0a4dea41
CB
521 return false;
522}
523
cc309f33 524static int pivot_enter()
0a4dea41 525{
87f7558b 526 __do_close_prot_errno int oldroot = -EBADF, newroot = -EBADF;
cc309f33 527
3326c17e 528 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
529 if (oldroot < 0)
530 return log_error_errno(-1, errno,
531 "Failed to open old root for fchdir");
cc309f33 532
3326c17e 533 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
534 if (newroot < 0)
535 return log_error_errno(-1, errno,
536 "Failed to open new root for fchdir");
cc309f33
CB
537
538 /* change into new root fs */
87f7558b
CB
539 if (fchdir(newroot) < 0)
540 return log_error_errno(-1,
541 errno, "Failed to change directory to new rootfs: %s",
542 ROOTDIR);
cc309f33 543
0a4dea41 544 /* pivot_root into our new root fs */
87f7558b
CB
545 if (pivot_root(".", ".") < 0)
546 return log_error_errno(-1, errno,
547 "pivot_root() syscall failed: %s",
548 strerror(errno));
0a4dea41
CB
549
550 /*
551 * At this point the old-root is mounted on top of our new-root.
552 * To unmounted it we must not be chdir'd into it, so escape back
553 * to the old-root.
554 */
87f7558b
CB
555 if (fchdir(oldroot) < 0)
556 return log_error_errno(-1, errno, "Failed to enter old root");
0a4dea41 557
87f7558b
CB
558 if (umount2(".", MNT_DETACH) < 0)
559 return log_error_errno(-1, errno, "Failed to detach old root");
0a4dea41 560
87f7558b
CB
561 if (fchdir(newroot) < 0)
562 return log_error_errno(-1, errno, "Failed to re-enter new root");
cc309f33 563
87f7558b 564 return 0;
0a4dea41
CB
565}
566
567static int chroot_enter()
568{
569 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
570 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
571 return -1;
572 }
573
574 if (chroot(".") < 0) {
575 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
576 return -1;
577 }
578
579 if (chdir("/") < 0) {
580 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
581 return -1;
582 }
583
584 return 0;
585}
586
/*
 * Enter the prepared new root: chroot_enter() when "/" is a ramfs,
 * pivot_enter() otherwise.
 *
 * Returns 0 on success, -1 (or chroot_enter()'s result) on failure.
 */
static int permute_and_enter(void)
{
	struct statfs sb;
	bool on_ramfs;

	if (statfs("/", &sb) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * /proc/self/mountinfo.
	 */
	on_ramfs = has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() < 0) {
		lxcfs_error("%s\n", "Could not perform pivot root.");
		return -1;
	}

	return 0;
}
609
610/* Prepare our new clean root. */
0232cbac 611static int permute_prepare(void)
29a73c2f
CB
612{
613 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 614 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
615 return -1;
616 }
617
618 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 619 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
620 return -1;
621 }
622
623 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 624 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
625 return -1;
626 }
627
628 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 629 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
630 return -1;
631 }
632
633 return 0;
634}
635
0232cbac
CB
/*
 * Prepare the new root and then enter it. Calls chroot() on ramfs,
 * pivot_root() in all other cases.
 *
 * Returns true only if both steps succeed; entering is not attempted when
 * the preparation fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
649
0a4dea41 650static bool cgfs_prepare_mounts(void)
29a73c2f
CB
651{
652 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 653 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
654 return false;
655 }
480262c9 656
29a73c2f 657 if (!umount_if_mounted()) {
b8defc3d 658 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
659 return false;
660 }
661
662 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 663 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
664 return false;
665 }
666
1d81c6a6 667 cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
0646f250 668 if (cgroup_ops->mntns_fd < 0) {
a257a8ee
CB
669 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
670 return false;
671 }
672
480262c9 673 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 674 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
675 return false;
676 }
480262c9 677
29a73c2f 678 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 679 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
680 return false;
681 }
480262c9 682
29a73c2f
CB
683 return true;
684}
685
0a4dea41 686static bool cgfs_mount_hierarchies(void)
29a73c2f 687{
5fbea8a6
CB
688 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
689 return false;
51c7ca35 690
5fbea8a6
CB
691 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
692 return false;
29a73c2f 693
5fbea8a6
CB
694 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
695 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
696 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
697 if ((*h)->fd < 0)
29a73c2f 698 return false;
29a73c2f 699 }
5fbea8a6 700
29a73c2f
CB
701 return true;
702}
703
/*
 * Run the three setup stages in order: prepare the private mounts, mount the
 * cgroup hierarchies, switch into the new root. Stops at the first stage
 * that fails.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root() ? true : false;
}
717
2243c5a9 718static void __attribute__((constructor)) lxcfs_init(void)
237e200e 719{
2aa59b2e 720 __do_close_prot_errno int init_ns = -EBADF, pidfd = -EBADF;
4ec5c9da 721 int i = 0;
2aa59b2e 722 pid_t pid;
5fbea8a6 723 char *cret;
e58dab00 724 char cwd[MAXPATHLEN];
237e200e 725
cc42d0c7
CB
726 lxcfs_info("Running constructor %s", __func__);
727
5fbea8a6
CB
728 cgroup_ops = cgroup_init();
729 if (!cgroup_ops)
2243c5a9 730 log_exit("Failed to initialize cgroup support");
237e200e 731
480262c9 732 /* Preserve initial namespace. */
2aa59b2e
CB
733 pid = getpid();
734 init_ns = preserve_ns(pid, "mnt");
2243c5a9
CB
735 if (init_ns < 0)
736 log_exit("Failed to preserve initial mount namespace");
480262c9 737
e58dab00 738 cret = getcwd(cwd, MAXPATHLEN);
4ec5c9da 739 if (!cret)
2243c5a9 740 log_exit("%s - Could not retrieve current working directory", strerror(errno));
e58dab00 741
480262c9
CB
742 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
743 * to privately mount lxcfs cgroups. */
2243c5a9
CB
744 if (!cgfs_setup_controllers())
745 log_exit("Failed to setup private cgroup mounts for lxcfs");
480262c9 746
2243c5a9
CB
747 if (setns(init_ns, 0) < 0)
748 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
29a73c2f 749
e58dab00 750 if (!cret || chdir(cwd) < 0)
2243c5a9 751 log_exit("%s - Could not change back to original working directory", strerror(errno));
e58dab00 752
2243c5a9
CB
753 if (!init_cpuview())
754 log_exit("Failed to init CPU view");
056adcef 755
cc42d0c7
CB
756 lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
757 lxcfs_info("hierarchies:");
4ec5c9da
CB
758
759 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
cc42d0c7
CB
760 char **controller_list = (*h)->controllers;
761 __do_free char *controllers = NULL;
762 if (controller_list && *controller_list)
763 controllers = lxc_string_join(",", (const char **)controller_list, false);
764 lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
4ec5c9da 765 }
2aa59b2e
CB
766
767 pidfd = pidfd_open(pid, 0);
768 if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
769 can_use_pidfd = true;
cc42d0c7 770 lxcfs_info("Kernel supports pidfds");
2aa59b2e 771 }
ce8fc84c 772
cc42d0c7 773 lxcfs_info("api_extensions:");
ce8fc84c 774 for (i = 0; i < nr_api_extensions; i++)
cc42d0c7 775 lxcfs_info("- %s", api_extensions[i]);
237e200e
SH
776}
777
2243c5a9 778static void __attribute__((destructor)) lxcfs_exit(void)
237e200e 779{
cc42d0c7
CB
780 lxcfs_info("Running destructor %s", __func__);
781
056adcef 782 free_cpuview();
2243c5a9 783 cgroup_exit(cgroup_ops);
1c4b4e38 784}