]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/bindings.c
Merge pull request #417 from brauner/2020-06-10/fixes
[mirror_lxcfs.git] / src / bindings.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
237e200e 2
1f5596dd
CB
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE
5#endif
6
7#ifndef FUSE_USE_VERSION
237e200e 8#define FUSE_USE_VERSION 26
1f5596dd
CB
9#endif
10
11#define _FILE_OFFSET_BITS 64
237e200e 12
237e200e 13#include <dirent.h>
29a73c2f 14#include <errno.h>
237e200e
SH
15#include <fcntl.h>
16#include <fuse.h>
0ecddf02 17#include <inttypes.h>
237e200e 18#include <libgen.h>
dee86006
CB
19#include <linux/magic.h>
20#include <linux/sched.h>
237e200e 21#include <pthread.h>
29a73c2f 22#include <sched.h>
db1b32f6 23#include <stdarg.h>
29a73c2f 24#include <stdbool.h>
0ecddf02 25#include <stdint.h>
29a73c2f
CB
26#include <stdio.h>
27#include <stdlib.h>
28#include <string.h>
29a73c2f
CB
29#include <sys/epoll.h>
30#include <sys/mman.h>
31#include <sys/mount.h>
237e200e
SH
32#include <sys/param.h>
33#include <sys/socket.h>
29a73c2f 34#include <sys/syscall.h>
0ecddf02 35#include <sys/sysinfo.h>
d89504c4 36#include <sys/vfs.h>
dee86006
CB
37#include <time.h>
38#include <unistd.h>
39#include <wait.h>
237e200e 40
ce8fc84c 41#include "api_extensions.h"
237e200e 42#include "bindings.h"
580fe4df 43#include "cgroup_fuse.h"
5fbea8a6
CB
44#include "cgroups/cgroup.h"
45#include "cgroups/cgroup_utils.h"
dee86006 46#include "config.h"
c9236032 47#include "memory_utils.h"
1f5596dd 48#include "proc_cpuview.h"
8364a99c 49#include "syscall_numbers.h"
1d81c6a6 50#include "utils.h"
237e200e 51
2aa59b2e 52static bool can_use_pidfd;
b9b6bdc9
CB
53
/*
 * Non-zero while liblxcfs is usable. Written by the constructor and toggled
 * from the SIGUSR2 handler, hence the sig_atomic_t type.
 */
static volatile sig_atomic_t reload_successful;

/* Report whether the library is currently functional (flag is non-zero). */
bool liblxcfs_functional(void)
{
	if (reload_successful == 0)
		return false;

	return true;
}
2aa59b2e 60
29a73c2f
CB
/*
 * pivot_root() may be missing from the C library. If the build found one
 * (HAVE_PIVOT_ROOT) only declare it, otherwise provide a local wrapper that
 * issues the raw system call.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
70
237e200e
SH
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid.  Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved.  If not, clear the store
 *      entry and go back to a.  If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino;     /* inode number for /proc/$pid/ns/pid */
	pid_t initpid; /* the pid of init in that ns */
	int init_pidfd; /* pidfd for initpid; negative when no pidfd was opened */
	int64_t ctime; /* the time at which /proc/$initpid was created */
	struct pidns_init_store *next; /* next entry in the same hash bucket */
	int64_t lastcheck; /* time(NULL) of the last successful validation; drives pruning */
};
93
/* lol - look at how they are allocated in the kernel */

/* Number of buckets in the init pid cache. */
#define PIDNS_HASH_SIZE 4096
/* Map a pid namespace inode number to its bucket index. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads of the init pid cache; each bucket is a singly linked list. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Serializes all access to pidns_hash_table. */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da 100
/*
 * Lock @l. A non-zero result from pthread_mutex_lock() is handed to
 * log_exit() together with its strerror() text.
 */
static void mutex_lock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	log_exit("%s - returned %d\n", strerror(err), err);
}
109
77f4399a 110struct cgroup_ops *cgroup_ops;
29a73c2f 111
/*
 * Unlock @l. A non-zero result from pthread_mutex_unlock() is handed to
 * log_exit() together with its strerror() text.
 */
static void mutex_unlock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	log_exit("%s - returned %d\n", strerror(err), err);
}
120
4e1e4115 121static inline void store_lock(void)
237e200e 122{
4e1e4115 123 mutex_lock(&pidns_store_mutex);
237e200e
SH
124}
125
4e1e4115 126static inline void store_unlock(void)
237e200e 127{
4e1e4115 128 mutex_unlock(&pidns_store_mutex);
237e200e
SH
129}
130
2aa59b2e
CB
/*
 * Buffer size for a "/proc/<pid>" path:
 *
 * /proc/       = STRLITERALLEN("/proc/")
 * +
 * <pid-as-str> = INTTYPE_TO_STRLEN(uint64_t)
 *                (sized for uint64_t rather than pid_t; presumably a
 *                deliberate over-estimate - TODO confirm)
 * +
 * \0           = 1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
139
bc189096 140static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
237e200e 141{
bc189096 142 int ret;
237e200e 143
bc189096
CB
144 if (entry->init_pidfd < 0)
145 return ret_errno(ENOSYS);
7dd6560a 146
bc189096
CB
147 ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
148 if (ret < 0) {
149 if (errno == ENOSYS)
150 return ret_errno(ENOSYS);
7dd6560a 151
bc189096 152 return 0;
2aa59b2e
CB
153 }
154
bc189096
CB
155 return 1;
156}
157
158static int initpid_still_valid_stat(struct pidns_init_store *entry)
159{
160 struct stat st;
161 char path[LXCFS_PROC_PID_LEN];
162
163 snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
164 if (stat(path, &st) || entry->ctime != st.st_ctime)
165 return 0;
166
167 return 1;
168}
169
/*
 * Decide whether the init pid cached in @entry is still the same process.
 * The pidfd based check is tried first; only when it returns a negative
 * value is the stat() based check consulted.
 *
 * Must be called under store_lock
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int verdict = initpid_still_valid_pidfd(entry);

	if (verdict >= 0)
		return verdict == 1;

	return initpid_still_valid_stat(entry) == 1;
}
181
182/* Must be called under store_lock */
2aa59b2e 183static void remove_initpid(struct pidns_init_store *entry)
237e200e 184{
2aa59b2e
CB
185 struct pidns_init_store *it;
186 int ino_hash;
237e200e 187
2aa59b2e
CB
188 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
189 entry->initpid);
7dd6560a 190
2aa59b2e
CB
191 ino_hash = HASH(entry->ino);
192 if (pidns_hash_table[ino_hash] == entry) {
193 pidns_hash_table[ino_hash] = entry->next;
194 close_prot_errno_disarm(entry->init_pidfd);
195 free_disarm(entry);
237e200e
SH
196 return;
197 }
198
2aa59b2e
CB
199 it = pidns_hash_table[ino_hash];
200 while (it) {
201 if (it->next == entry) {
202 it->next = entry->next;
203 close_prot_errno_disarm(entry->init_pidfd);
204 free_disarm(entry);
237e200e
SH
205 return;
206 }
2aa59b2e 207 it = it->next;
237e200e
SH
208 }
209}
210
211#define PURGE_SECS 5
212/* Must be called under store_lock */
213static void prune_initpid_store(void)
214{
1ba088ae
CB
215 static int64_t last_prune = 0;
216 int64_t now, threshold;
237e200e
SH
217
218 if (!last_prune) {
219 last_prune = time(NULL);
220 return;
221 }
2aa59b2e 222
237e200e 223 now = time(NULL);
b18d6121 224 if (now < (last_prune + PURGE_SECS))
237e200e 225 return;
7dd6560a 226
2aa59b2e 227 lxcfs_debug("Pruning init pid cache");
7dd6560a 228
237e200e
SH
229 last_prune = now;
230 threshold = now - 2 * PURGE_SECS;
231
2aa59b2e
CB
232 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
233 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
234 if (entry->lastcheck < threshold) {
235 struct pidns_init_store *cur = entry;
7dd6560a 236
2aa59b2e 237 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
7dd6560a 238
237e200e 239 if (prev)
2aa59b2e 240 prev->next = entry->next;
237e200e 241 else
2aa59b2e
CB
242 pidns_hash_table[i] = entry->next;
243 entry = entry->next;
244 close_prot_errno_disarm(cur->init_pidfd);
245 free_disarm(cur);
237e200e 246 } else {
2aa59b2e
CB
247 prev = entry;
248 entry = entry->next;
237e200e
SH
249 }
250 }
251 }
252}
253
c8f77ce4
CB
254static void clear_initpid_store(void)
255{
256 store_lock();
257 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
258 for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) {
259 struct pidns_init_store *cur = entry;
260
261 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
262
263 pidns_hash_table[i] = entry->next;
264 entry = entry->next;
265 close_prot_errno_disarm(cur->init_pidfd);
266 free_disarm(cur);
267 }
268 }
269 store_unlock();
270}
271
237e200e 272/* Must be called under store_lock */
fcdedd16 273static void save_initpid(ino_t pidns_inode, pid_t pid)
237e200e 274{
1e5d03fe 275 __do_free struct pidns_init_store *entry = NULL;
05b7a16d 276 __do_close int pidfd = -EBADF;
536620fd 277 const struct lxcfs_opts *opts = fuse_get_context()->private_data;
2aa59b2e 278 char path[LXCFS_PROC_PID_LEN];
2aa59b2e
CB
279 struct stat st;
280 int ino_hash;
281
9973cc06 282 if (opts && opts->use_pidfd && can_use_pidfd) {
2aa59b2e
CB
283 pidfd = pidfd_open(pid, 0);
284 if (pidfd < 0)
285 return;
286 }
237e200e 287
2aa59b2e
CB
288 snprintf(path, sizeof(path), "/proc/%d", pid);
289 if (stat(path, &st))
290 return;
7dd6560a 291
5ec289bf 292 entry = zalloc(sizeof(*entry));
0eb3756b 293 if (!entry)
237e200e 294 return;
2aa59b2e 295
97017213 296 ino_hash = HASH(pidns_inode);
1e5d03fe 297 *entry = (struct pidns_init_store){
fcdedd16 298 .ino = pidns_inode,
1e5d03fe
CB
299 .initpid = pid,
300 .ctime = st.st_ctime,
301 .next = pidns_hash_table[ino_hash],
302 .lastcheck = time(NULL),
303 .init_pidfd = move_fd(pidfd),
304 };
305 pidns_hash_table[ino_hash] = move_ptr(entry);
2aa59b2e
CB
306
307 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
237e200e
SH
308}
309
310/*
311 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
312 * entry for the inode number and creation time. Verify that the init pid
313 * is still valid. If not, remove it. Return the entry if valid, NULL
314 * otherwise.
315 * Must be called under store_lock
316 */
cfda2e8a 317static pid_t lookup_verify_initpid(ino_t pidns_inode)
237e200e 318{
fcdedd16 319 struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
2aa59b2e
CB
320
321 while (entry) {
fcdedd16 322 if (entry->ino == pidns_inode) {
2aa59b2e
CB
323 if (initpid_still_valid(entry)) {
324 entry->lastcheck = time(NULL);
cfda2e8a 325 return entry->initpid;
237e200e 326 }
2aa59b2e
CB
327
328 remove_initpid(entry);
cfda2e8a 329 return ret_errno(ESRCH);
237e200e 330 }
2aa59b2e 331 entry = entry->next;
237e200e
SH
332 }
333
cfda2e8a 334 return ret_errno(ESRCH);
237e200e
SH
335}
336
35acc247 337static bool send_creds_ok(int sock_fd)
237e200e 338{
f1744de4
CB
339 char v = '1'; /* we are the child */
340 struct ucred cred = {
341 .uid = 0,
342 .gid = 0,
343 .pid = 1,
344 };
345
35acc247 346 return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK;
237e200e
SH
347}
348
35acc247 349__returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
87f7558b 350{
35acc247
CB
351 /*
352 * These flags don't interest at all so we don't jump through any hoops
353 * of retrieving them and passing them to the kernel.
354 */
355 errno = EINVAL;
356 if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
357 CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
358 return -EINVAL;
359
360#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
361 /* On s390/s390x and cris the order of the first and second arguments
362 * of the system call is reversed.
363 */
364 return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
365#elif defined(__sparc__) && defined(__arch64__)
366 {
367 /*
368 * sparc64 always returns the other process id in %o0, and a
369 * boolean flag whether this is the child or the parent in %o1.
370 * Inline assembly is needed to get the flag returned in %o1.
371 */
372 register long g1 asm("g1") = __NR_clone;
373 register long o0 asm("o0") = flags | SIGCHLD;
374 register long o1 asm("o1") = 0; /* is parent/child indicator */
375 register long o2 asm("o2") = (unsigned long)pidfd;
376 long is_error, retval, in_child;
377 pid_t child_pid;
378
379 asm volatile(
380#if defined(__arch64__)
381 "t 0x6d\n\t" /* 64-bit trap */
382#else
383 "t 0x10\n\t" /* 32-bit trap */
384#endif
385 /*
386 * catch errors: On sparc, the carry bit (csr) in the
387 * processor status register (psr) is used instead of a
388 * full register.
389 */
390 "addx %%g0, 0, %%g1"
391 : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
392 : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */
393 : "%cc"); /* clobbers */
394
395 is_error = g1;
396 retval = o0;
397 in_child = o1;
398
399 if (is_error) {
400 errno = retval;
401 return -1;
402 }
87f7558b 403
35acc247
CB
404 if (in_child)
405 return 0;
87f7558b 406
35acc247
CB
407 child_pid = retval;
408 return child_pid;
409 }
410#elif defined(__ia64__)
411 /* On ia64 the stack and stack size are passed as separate arguments. */
412 return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd);
87f7558b 413#else
35acc247 414 return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
87f7558b 415#endif
87f7558b
CB
416}
417
/*
 * Buffer size for a "/proc/<pid>/ns/pid" path: both string literals, the
 * string length reserved for a uint64_t, and the terminating \0.
 */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
421
580fe4df
CB
422/*
423 * clone a task which switches to @task's namespace and writes '1'.
424 * over a unix sock so we can read the task's reaper's pid in our
425 * namespace
426 *
427 * Note: glibc's fork() does not respect pidns, which can lead to failed
428 * assertions inside glibc (and thus failed forks) if the child's pid in
429 * the pidns and the parent pid outside are identical. Using clone prevents
430 * this issue.
431 */
432static void write_task_init_pid_exit(int sock, pid_t target)
433{
05b7a16d 434 __do_close int fd = -EBADF;
87f7558b 435 char path[LXCFS_PROC_PID_NS_LEN];
580fe4df 436 pid_t pid;
87f7558b
CB
437
438 snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
439 fd = open(path, O_RDONLY | O_CLOEXEC);
440 if (fd < 0)
441 log_exit("write_task_init_pid_exit open of ns/pid");
442
443 if (setns(fd, 0))
444 log_exit("Failed to setns to pid namespace of process %d", target);
445
35acc247 446 pid = lxcfs_raw_clone(0, NULL);
580fe4df 447 if (pid < 0)
87f7558b
CB
448 _exit(EXIT_FAILURE);
449
35acc247
CB
450 if (pid == 0) {
451 if (!send_creds_ok(sock))
87f7558b
CB
452 _exit(EXIT_FAILURE);
453
454 _exit(EXIT_SUCCESS);
237e200e 455 }
35acc247
CB
456
457 if (!wait_for_pid(pid))
458 _exit(EXIT_FAILURE);
459
460 _exit(EXIT_SUCCESS);
237e200e
SH
461}
462
8a07696e 463static pid_t scm_init_pid(pid_t task)
237e200e 464{
580fe4df 465 char v = '0';
87f7558b 466 pid_t pid_ret = -1;
dac3dc93
CB
467 struct ucred cred = {
468 .pid = -1,
469 .uid = -1,
470 .gid = -1,
471 };
87f7558b
CB
472 pid_t pid;
473 int sock[2];
237e200e 474
87f7558b 475 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
580fe4df 476 return -1;
237e200e 477
580fe4df
CB
478 pid = fork();
479 if (pid < 0)
480 goto out;
87f7558b
CB
481
482 if (pid == 0) {
580fe4df
CB
483 close(sock[1]);
484 write_task_init_pid_exit(sock[0], task);
87f7558b 485 _exit(EXIT_SUCCESS);
237e200e 486 }
7213ec5c 487
580fe4df
CB
488 if (!recv_creds(sock[1], &cred, &v))
489 goto out;
87f7558b
CB
490
491 pid_ret = cred.pid;
237e200e 492
580fe4df
CB
493out:
494 close(sock[0]);
495 close(sock[1]);
496 if (pid > 0)
497 wait_for_pid(pid);
237e200e 498
87f7558b
CB
499 return pid_ret;
500}
2aa59b2e
CB
501
502pid_t lookup_initpid_in_store(pid_t pid)
237e200e 503{
cfda2e8a 504 pid_t hashed_pid = 0;
2aa59b2e
CB
505 char path[LXCFS_PROC_PID_NS_LEN];
506 struct stat st;
2aa59b2e
CB
507
508 snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
2aa59b2e 509 if (stat(path, &st))
4e1e4115 510 return ret_errno(ESRCH);
2aa59b2e 511
4e1e4115 512 store_lock();
fcdedd16 513
cfda2e8a
CB
514 hashed_pid = lookup_verify_initpid(st.st_ino);
515 if (hashed_pid < 0) {
516 /* release the mutex as the following call is expensive */
517 store_unlock();
2aa59b2e 518
8a07696e 519 hashed_pid = scm_init_pid(pid);
4e1e4115 520
cfda2e8a 521 store_lock();
4e1e4115 522
cfda2e8a
CB
523 if (hashed_pid > 0)
524 save_initpid(st.st_ino, hashed_pid);
525 }
b7672ded 526
2aa59b2e 527 /*
cfda2e8a
CB
528 * Prune at the end in case we're pruning the value
529 * we were about to return.
2aa59b2e 530 */
580fe4df 531 prune_initpid_store();
4e1e4115 532 store_unlock();
2aa59b2e 533
cfda2e8a 534 return hashed_pid;
237e200e
SH
535}
536
29a73c2f
CB
537/*
538 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
539 */
540
29a73c2f
CB
541static bool umount_if_mounted(void)
542{
543 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 544 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
545 return false;
546 }
547 return true;
548}
549
2283e240
CB
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Tell whether the filesystem described by @fs has the magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
556
0a4dea41
CB
557/*
558 * looking at fs/proc_namespace.c, it appears we can
559 * actually expect the rootfs entry to very specifically contain
560 * " - rootfs rootfs "
561 * IIUC, so long as we've chrooted so that rootfs is not our root,
562 * the rootfs entry should always be skipped in mountinfo contents.
563 */
564static bool is_on_ramfs(void)
565{
87f7558b 566 __do_free char *line = NULL;
757a63e7 567 __do_free void *fopen_cache = NULL;
87f7558b 568 __do_fclose FILE *f = NULL;
0a4dea41 569 size_t len = 0;
0a4dea41 570
757a63e7 571 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
0a4dea41
CB
572 if (!f)
573 return false;
574
575 while (getline(&line, &len, f) != -1) {
87f7558b
CB
576 int i;
577 char *p, *p2;
578
0a4dea41
CB
579 for (p = line, i = 0; p && i < 4; i++)
580 p = strchr(p + 1, ' ');
581 if (!p)
582 continue;
87f7558b 583
0a4dea41
CB
584 p2 = strchr(p + 1, ' ');
585 if (!p2)
586 continue;
587 *p2 = '\0';
588 if (strcmp(p + 1, "/") == 0) {
87f7558b 589 /* This is '/'. Is it the ramfs? */
0a4dea41 590 p = strchr(p2 + 1, '-');
87f7558b 591 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
0a4dea41 592 return true;
0a4dea41
CB
593 }
594 }
87f7558b 595
0a4dea41
CB
596 return false;
597}
598
cc309f33 599static int pivot_enter()
0a4dea41 600{
05b7a16d 601 __do_close int oldroot = -EBADF, newroot = -EBADF;
cc309f33 602
3326c17e 603 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
604 if (oldroot < 0)
605 return log_error_errno(-1, errno,
606 "Failed to open old root for fchdir");
cc309f33 607
3326c17e 608 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
87f7558b
CB
609 if (newroot < 0)
610 return log_error_errno(-1, errno,
611 "Failed to open new root for fchdir");
cc309f33
CB
612
613 /* change into new root fs */
87f7558b
CB
614 if (fchdir(newroot) < 0)
615 return log_error_errno(-1,
616 errno, "Failed to change directory to new rootfs: %s",
617 ROOTDIR);
cc309f33 618
0a4dea41 619 /* pivot_root into our new root fs */
87f7558b
CB
620 if (pivot_root(".", ".") < 0)
621 return log_error_errno(-1, errno,
622 "pivot_root() syscall failed: %s",
623 strerror(errno));
0a4dea41
CB
624
625 /*
626 * At this point the old-root is mounted on top of our new-root.
627 * To unmounted it we must not be chdir'd into it, so escape back
628 * to the old-root.
629 */
87f7558b
CB
630 if (fchdir(oldroot) < 0)
631 return log_error_errno(-1, errno, "Failed to enter old root");
0a4dea41 632
87f7558b
CB
633 if (umount2(".", MNT_DETACH) < 0)
634 return log_error_errno(-1, errno, "Failed to detach old root");
0a4dea41 635
87f7558b
CB
636 if (fchdir(newroot) < 0)
637 return log_error_errno(-1, errno, "Failed to re-enter new root");
cc309f33 638
87f7558b 639 return 0;
0a4dea41
CB
640}
641
642static int chroot_enter()
643{
644 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
645 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
646 return -1;
647 }
648
649 if (chroot(".") < 0) {
650 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
651 return -1;
652 }
653
654 if (chdir("/") < 0) {
655 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
656 return -1;
657 }
658
659 return 0;
660}
661
/*
 * Switch into the prepared root: chroot_enter() when "/" is on a ramfs,
 * pivot_enter() otherwise. Returns 0 on success, -1 on failure (a failing
 * chroot_enter() result is passed through unchanged).
 */
static int permute_and_enter(void)
{
	struct statfs sb;
	bool on_ramfs;

	if (statfs("/", &sb) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * /proc/self/mountinfo via is_on_ramfs(). */
	on_ramfs = has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
684
685/* Prepare our new clean root. */
0232cbac 686static int permute_prepare(void)
29a73c2f
CB
687{
688 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 689 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
690 return -1;
691 }
692
693 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 694 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
695 return -1;
696 }
697
698 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 699 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
700 return -1;
701 }
702
703 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 704 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
705 return -1;
706 }
707
708 return 0;
709}
710
0232cbac
CB
/*
 * Calls chroot() on ramfs, pivot_root() in all other cases.
 *
 * The new root is prepared first; entering it is only attempted when the
 * preparation succeeded. Returns true when both steps succeed.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
724
0a4dea41 725static bool cgfs_prepare_mounts(void)
29a73c2f
CB
726{
727 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 728 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
729 return false;
730 }
480262c9 731
29a73c2f 732 if (!umount_if_mounted()) {
b8defc3d 733 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
734 return false;
735 }
736
737 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 738 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
739 return false;
740 }
741
1d81c6a6 742 cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
0646f250 743 if (cgroup_ops->mntns_fd < 0) {
a257a8ee
CB
744 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
745 return false;
746 }
747
480262c9 748 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 749 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
750 return false;
751 }
480262c9 752
29a73c2f 753 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 754 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
755 return false;
756 }
480262c9 757
29a73c2f
CB
758 return true;
759}
760
0a4dea41 761static bool cgfs_mount_hierarchies(void)
29a73c2f 762{
5fbea8a6
CB
763 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
764 return false;
51c7ca35 765
5fbea8a6
CB
766 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
767 return false;
29a73c2f 768
5fbea8a6
CB
769 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
770 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
771 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
772 if ((*h)->fd < 0)
29a73c2f 773 return false;
29a73c2f 774 }
5fbea8a6 775
29a73c2f
CB
776 return true;
777}
778
/*
 * Build the private cgroup mount setup: prepare the mount namespace, mount
 * the hierarchies, then switch into the new root. Stops at the first step
 * that fails and returns false; returns true when all three succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root();
}
792
dee86006 793static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra)
b9b6bdc9
CB
794{
795 int ret;
796
797 if (reload_successful) {
798 reload_successful = 0;
799
800 /* write() is async signal safe */
801 ret = write(STDERR_FILENO,
802 "Switched into non-virtualization mode\n",
803 STRLITERALLEN("Switched into non-virtualization mode\n"));
804 if (ret < 0)
805 goto please_compiler;
806 } else {
807 reload_successful = 1;
808
809 /* write() is async signal safe */
810 ret = write(STDERR_FILENO, "Switched into virtualization mode\n",
811 STRLITERALLEN("Switched into virtualization mode\n"));
812 if (ret < 0)
813 goto please_compiler;
814 }
815
816please_compiler:
817 /*
818 * The write() syscall is a function whose return value needs to be
819 * checked. Otherwise the compiler will warn. This is how we
820 * please our master. Another one could be to use
821 * syscall(__NR_write, ...) directly but whatever.
822 */
823 return;
824}
825
2243c5a9 826static void __attribute__((constructor)) lxcfs_init(void)
237e200e 827{
05b7a16d 828 __do_close int init_ns = -EBADF, root_fd = -EBADF,
de69569b 829 pidfd = -EBADF;
4ec5c9da 830 int i = 0;
2aa59b2e 831 pid_t pid;
237e200e 832
c2357135 833 lxcfs_info("Running constructor %s to reload liblxcfs", __func__);
cc42d0c7 834
5fbea8a6 835 cgroup_ops = cgroup_init();
c2357135
CB
836 if (!cgroup_ops) {
837 lxcfs_info("Failed to initialize cgroup support");
838 goto broken_upgrade;
839 }
237e200e 840
480262c9 841 /* Preserve initial namespace. */
2aa59b2e
CB
842 pid = getpid();
843 init_ns = preserve_ns(pid, "mnt");
c2357135
CB
844 if (init_ns < 0) {
845 lxcfs_info("Failed to preserve initial mount namespace");
846 goto broken_upgrade;
847 }
480262c9 848
480262c9
CB
849 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
850 * to privately mount lxcfs cgroups. */
c2357135 851 if (!cgfs_setup_controllers()) {
2243c5a9 852 log_exit("Failed to setup private cgroup mounts for lxcfs");
c2357135
CB
853 goto broken_upgrade;
854 }
480262c9 855
c2357135 856 if (setns(init_ns, 0) < 0) {
2243c5a9 857 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
c2357135
CB
858 goto broken_upgrade;
859 }
29a73c2f 860
c2357135 861 if (!init_cpuview()) {
2243c5a9 862 log_exit("Failed to init CPU view");
c2357135
CB
863 goto broken_upgrade;
864 }
056adcef 865
cc42d0c7
CB
866 lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
867 lxcfs_info("hierarchies:");
4ec5c9da
CB
868
869 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
cc42d0c7
CB
870 char **controller_list = (*h)->controllers;
871 __do_free char *controllers = NULL;
872 if (controller_list && *controller_list)
873 controllers = lxc_string_join(",", (const char **)controller_list, false);
874 lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
4ec5c9da 875 }
2aa59b2e
CB
876
877 pidfd = pidfd_open(pid, 0);
878 if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
879 can_use_pidfd = true;
cc42d0c7 880 lxcfs_info("Kernel supports pidfds");
2aa59b2e 881 }
ce8fc84c 882
cc42d0c7 883 lxcfs_info("api_extensions:");
ce8fc84c 884 for (i = 0; i < nr_api_extensions; i++)
cc42d0c7 885 lxcfs_info("- %s", api_extensions[i]);
de69569b
CB
886
887 root_fd = open("/", O_PATH | O_CLOEXEC);
c2357135
CB
888 if (root_fd < 0)
889 lxcfs_info("%s - Failed to open root directory", strerror(errno));
890 else if (fchdir(root_fd) < 0)
891 lxcfs_info("%s - Failed to change to root directory", strerror(errno));
892
dee86006
CB
893 if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
894 lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
b9b6bdc9 895 goto broken_upgrade;
dee86006 896 }
b9b6bdc9
CB
897
898 reload_successful = 1;
c2357135 899 return;
de69569b 900
c2357135 901broken_upgrade:
b9b6bdc9 902 reload_successful = 0;
c2357135 903 lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
237e200e
SH
904}
905
2243c5a9 906static void __attribute__((destructor)) lxcfs_exit(void)
237e200e 907{
cc42d0c7
CB
908 lxcfs_info("Running destructor %s", __func__);
909
c8f77ce4 910 clear_initpid_store();
056adcef 911 free_cpuview();
2243c5a9 912 cgroup_exit(cgroup_ops);
1c4b4e38 913}