]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
Merge pull request #328 from brauner/master
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
1f5596dd
CB
9#ifndef _GNU_SOURCE
10#define _GNU_SOURCE
11#endif
12
13#ifndef FUSE_USE_VERSION
237e200e 14#define FUSE_USE_VERSION 26
1f5596dd
CB
15#endif
16
17#define _FILE_OFFSET_BITS 64
237e200e 18
237e200e 19#include <dirent.h>
29a73c2f 20#include <errno.h>
237e200e
SH
21#include <fcntl.h>
22#include <fuse.h>
0ecddf02 23#include <inttypes.h>
237e200e 24#include <libgen.h>
237e200e 25#include <pthread.h>
29a73c2f 26#include <sched.h>
db1b32f6 27#include <stdarg.h>
29a73c2f 28#include <stdbool.h>
0ecddf02 29#include <stdint.h>
29a73c2f
CB
30#include <stdio.h>
31#include <stdlib.h>
32#include <string.h>
33#include <time.h>
34#include <unistd.h>
35#include <wait.h>
d89504c4 36#include <linux/magic.h>
237e200e 37#include <linux/sched.h>
29a73c2f
CB
38#include <sys/epoll.h>
39#include <sys/mman.h>
40#include <sys/mount.h>
237e200e
SH
41#include <sys/param.h>
42#include <sys/socket.h>
29a73c2f 43#include <sys/syscall.h>
0ecddf02 44#include <sys/sysinfo.h>
d89504c4 45#include <sys/vfs.h>
237e200e 46
237e200e 47#include "bindings.h"
1d81c6a6 48#include "config.h"
580fe4df 49#include "cgroup_fuse.h"
5fbea8a6
CB
50#include "cgroups/cgroup.h"
51#include "cgroups/cgroup_utils.h"
c9236032 52#include "memory_utils.h"
1f5596dd 53#include "proc_cpuview.h"
1d81c6a6 54#include "utils.h"
237e200e 55
2aa59b2e
CB
/*
 * True once lxcfs_init() has successfully called both pidfd_open() and
 * pidfd_send_signal() on our own pid. save_initpid() consults it before
 * opening a pidfd for a cached init process.
 */
static bool can_use_pidfd;
57
29a73c2f
CB
/*
 * pivot_root(): use the C library's declaration when the build system found
 * one (HAVE_PIVOT_ROOT), otherwise provide a local wrapper around the raw
 * syscall. If the syscall number is unknown the wrapper fails with ENOSYS.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
72
237e200e
SH
/*
 * Cache of which pid is init for a given pid namespace.
 *
 * To find the init pid for a querying task $qpid we
 *   1. stat(2) /proc/$qpid/ns/pid;
 *   2. look the resulting inode number up in the store:
 *      a. if it is missing, fork a child into $qpid's pid namespace that
 *         sends us ucred.pid = 1, read back the init pid, and record it
 *         together with the st_ctime of /proc/$initpid in a new entry;
 *      b. if it is present, verify that /proc/$initpid still matches what
 *         was saved. On mismatch the entry is dropped and we fall back to
 *         (a); otherwise the cached init pid is returned.
 */
struct pidns_init_store {
	ino_t ino;      /* inode number of /proc/$pid/ns/pid */
	pid_t initpid;  /* the pid of init in that namespace */
	int init_pidfd; /* pidfd for initpid, or negative when none was opened */
	long int ctime; /* st_ctime of /proc/$initpid when the entry was saved */
	struct pidns_init_store *next; /* next entry in the same hash bucket */
	long int lastcheck; /* time(NULL) of the last successful lookup */
};
95
/*
 * Bucket count for the init pid cache, and the function mapping a pid
 * namespace inode number onto a bucket.
 * NOTE(review): the original remark only pointed at "how they are allocated
 * in the kernel"; the rationale for 4096 is not visible here.
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Chained hash table of cache entries; every access is guarded by the mutex. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da 102
237e200e
SH
/*
 * Acquire @l. A non-zero result from pthread_mutex_lock() is passed to
 * log_exit() together with its strerror() text.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
111
/*
 * Global cgroup driver handle: assigned from cgroup_init() in lxcfs_init()
 * and handed to cgroup_exit() in lxcfs_exit().
 */
struct cgroup_ops *cgroup_ops;
29a73c2f 113
237e200e
SH
/*
 * Release @l. A non-zero result from pthread_mutex_unlock() is passed to
 * log_exit() together with its strerror() text.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
122
123static void store_lock(void)
124{
125 lock_mutex(&pidns_store_mutex);
126}
127
128static void store_unlock(void)
129{
130 unlock_mutex(&pidns_store_mutex);
131}
132
2aa59b2e
CB
/*
 * Buffer size needed to format "/proc/<pid>":
 *
 *   "/proc/"       = STRLITERALLEN("/proc/")
 * + <pid-as-str>   = INTTYPE_TO_STRLEN(uint64_t)
 * + '\0'           = 1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
141
237e200e 142/* Must be called under store_lock */
2aa59b2e 143static bool initpid_still_valid(struct pidns_init_store *entry)
237e200e 144{
2aa59b2e 145 bool valid = true;
237e200e 146
2aa59b2e
CB
147 if (entry->init_pidfd >= 0) {
148 if (pidfd_send_signal(entry->init_pidfd, 0, NULL, 0))
149 valid = false;
150 } else {
151 struct stat st;
152 char path[LXCFS_PROC_PID_LEN];
7dd6560a 153
2aa59b2e 154 snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
7dd6560a 155
2aa59b2e
CB
156 if (stat(path, &st) || entry->ctime != st.st_ctime)
157 valid = false;
158 }
159
160 return valid;
237e200e
SH
161}
162
163/* Must be called under store_lock */
2aa59b2e 164static void remove_initpid(struct pidns_init_store *entry)
237e200e 165{
2aa59b2e
CB
166 struct pidns_init_store *it;
167 int ino_hash;
237e200e 168
2aa59b2e
CB
169 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
170 entry->initpid);
7dd6560a 171
2aa59b2e
CB
172 ino_hash = HASH(entry->ino);
173 if (pidns_hash_table[ino_hash] == entry) {
174 pidns_hash_table[ino_hash] = entry->next;
175 close_prot_errno_disarm(entry->init_pidfd);
176 free_disarm(entry);
237e200e
SH
177 return;
178 }
179
2aa59b2e
CB
180 it = pidns_hash_table[ino_hash];
181 while (it) {
182 if (it->next == entry) {
183 it->next = entry->next;
184 close_prot_errno_disarm(entry->init_pidfd);
185 free_disarm(entry);
237e200e
SH
186 return;
187 }
2aa59b2e 188 it = it->next;
237e200e
SH
189 }
190}
191
192#define PURGE_SECS 5
193/* Must be called under store_lock */
194static void prune_initpid_store(void)
195{
196 static long int last_prune = 0;
237e200e 197 long int now, threshold;
237e200e
SH
198
199 if (!last_prune) {
200 last_prune = time(NULL);
201 return;
202 }
2aa59b2e 203
237e200e
SH
204 now = time(NULL);
205 if (now < last_prune + PURGE_SECS)
206 return;
7dd6560a 207
2aa59b2e 208 lxcfs_debug("Pruning init pid cache");
7dd6560a 209
237e200e
SH
210 last_prune = now;
211 threshold = now - 2 * PURGE_SECS;
212
2aa59b2e
CB
213 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
214 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
215 if (entry->lastcheck < threshold) {
216 struct pidns_init_store *cur = entry;
7dd6560a 217
2aa59b2e 218 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
7dd6560a 219
237e200e 220 if (prev)
2aa59b2e 221 prev->next = entry->next;
237e200e 222 else
2aa59b2e
CB
223 pidns_hash_table[i] = entry->next;
224 entry = entry->next;
225 close_prot_errno_disarm(cur->init_pidfd);
226 free_disarm(cur);
237e200e 227 } else {
2aa59b2e
CB
228 prev = entry;
229 entry = entry->next;
237e200e
SH
230 }
231 }
232 }
233}
234
235/* Must be called under store_lock */
236static void save_initpid(struct stat *sb, pid_t pid)
237{
2aa59b2e
CB
238 __do_free struct pidns_init_store *e = NULL;
239 __do_close_prot_errno int pidfd = -EBADF;
240 char path[LXCFS_PROC_PID_LEN];
241 struct lxcfs_opts *opts = fuse_get_context()->private_data;
242 struct stat st;
243 int ino_hash;
244
245 if (opts->use_pidfd && can_use_pidfd) {
246 pidfd = pidfd_open(pid, 0);
247 if (pidfd < 0)
248 return;
249 }
237e200e 250
2aa59b2e
CB
251 snprintf(path, sizeof(path), "/proc/%d", pid);
252 if (stat(path, &st))
253 return;
7dd6560a 254
2aa59b2e
CB
255 e = malloc(sizeof(*e));
256 if (!e)
237e200e 257 return;
2aa59b2e 258
237e200e
SH
259 e->ino = sb->st_ino;
260 e->initpid = pid;
2aa59b2e
CB
261 e->ctime = st.st_ctime;
262 ino_hash = HASH(e->ino);
263 e->next = pidns_hash_table[ino_hash];
237e200e 264 e->lastcheck = time(NULL);
2aa59b2e
CB
265 e->init_pidfd = move_fd(pidfd);
266 pidns_hash_table[ino_hash] = move_ptr(e);
267
268 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
237e200e
SH
269}
270
271/*
272 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
273 * entry for the inode number and creation time. Verify that the init pid
274 * is still valid. If not, remove it. Return the entry if valid, NULL
275 * otherwise.
276 * Must be called under store_lock
277 */
278static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
279{
2aa59b2e
CB
280 struct pidns_init_store *entry = pidns_hash_table[HASH(sb->st_ino)];
281
282 while (entry) {
283 if (entry->ino == sb->st_ino) {
284 if (initpid_still_valid(entry)) {
285 entry->lastcheck = time(NULL);
286 return entry;
237e200e 287 }
2aa59b2e
CB
288
289 remove_initpid(entry);
237e200e
SH
290 return NULL;
291 }
2aa59b2e 292 entry = entry->next;
237e200e
SH
293 }
294
295 return NULL;
296}
297
4ec5c9da 298static int send_creds_clone_wrapper(void *arg)
237e200e 299{
4ec5c9da
CB
300 struct ucred cred;
301 char v;
302 int sock = *(int *)arg;
ba59ea09 303
4ec5c9da
CB
304 /* we are the child */
305 cred.uid = 0;
306 cred.gid = 0;
307 cred.pid = 1;
308 v = '1';
309 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
310 return 1;
311 return 0;
237e200e
SH
312}
313
580fe4df
CB
314/*
315 * clone a task which switches to @task's namespace and writes '1'.
316 * over a unix sock so we can read the task's reaper's pid in our
317 * namespace
318 *
319 * Note: glibc's fork() does not respect pidns, which can lead to failed
320 * assertions inside glibc (and thus failed forks) if the child's pid in
321 * the pidns and the parent pid outside are identical. Using clone prevents
322 * this issue.
323 */
324static void write_task_init_pid_exit(int sock, pid_t target)
325{
326 char fnam[100];
327 pid_t pid;
328 int fd, ret;
329 size_t stack_size = sysconf(_SC_PAGESIZE);
330 void *stack = alloca(stack_size);
237e200e 331
580fe4df
CB
332 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
333 if (ret < 0 || ret >= sizeof(fnam))
334 _exit(1);
f23fe717 335
580fe4df
CB
336 fd = open(fnam, O_RDONLY);
337 if (fd < 0) {
338 perror("write_task_init_pid_exit open of ns/pid");
339 _exit(1);
237e200e 340 }
580fe4df
CB
341 if (setns(fd, 0)) {
342 perror("write_task_init_pid_exit setns 1");
343 close(fd);
344 _exit(1);
345 }
346 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
347 if (pid < 0)
348 _exit(1);
349 if (pid != 0) {
350 if (!wait_for_pid(pid))
351 _exit(1);
352 _exit(0);
237e200e 353 }
237e200e
SH
354}
355
580fe4df 356static pid_t get_init_pid_for_task(pid_t task)
237e200e 357{
580fe4df
CB
358 int sock[2];
359 pid_t pid;
360 pid_t ret = -1;
361 char v = '0';
362 struct ucred cred;
237e200e 363
580fe4df
CB
364 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
365 perror("socketpair");
366 return -1;
237e200e
SH
367 }
368
580fe4df
CB
369 pid = fork();
370 if (pid < 0)
371 goto out;
372 if (!pid) {
373 close(sock[1]);
374 write_task_init_pid_exit(sock[0], task);
375 _exit(0);
237e200e 376 }
7213ec5c 377
580fe4df
CB
378 if (!recv_creds(sock[1], &cred, &v))
379 goto out;
380 ret = cred.pid;
237e200e 381
580fe4df
CB
382out:
383 close(sock[0]);
384 close(sock[1]);
385 if (pid > 0)
386 wait_for_pid(pid);
237e200e
SH
387 return ret;
388}
389
2aa59b2e
CB
/*
 * Buffer size needed to format "/proc/<pid>/ns/pid": the two literal parts,
 * room for the pid as a decimal string, and the terminating '\0'.
 */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
393
394pid_t lookup_initpid_in_store(pid_t pid)
237e200e 395{
580fe4df 396 pid_t answer = 0;
2aa59b2e
CB
397 char path[LXCFS_PROC_PID_NS_LEN];
398 struct stat st;
399 struct pidns_init_store *entry;
400
401 snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
b7672ded 402
580fe4df 403 store_lock();
2aa59b2e 404 if (stat(path, &st))
580fe4df 405 goto out;
2aa59b2e
CB
406
407 entry = lookup_verify_initpid(&st);
408 if (entry) {
409 answer = entry->initpid;
580fe4df
CB
410 goto out;
411 }
2aa59b2e
CB
412
413 answer = get_init_pid_for_task(pid);
580fe4df 414 if (answer > 0)
2aa59b2e 415 save_initpid(&st, answer);
b7672ded 416
580fe4df 417out:
2aa59b2e
CB
418 /*
419 * Prune at the end in case we're returning the value we were about to
420 * return.
421 */
580fe4df 422 prune_initpid_store();
2aa59b2e 423
580fe4df 424 store_unlock();
2aa59b2e 425
580fe4df 426 return answer;
237e200e
SH
427}
428
29a73c2f
CB
429/*
430 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
431 */
432
29a73c2f
CB
433static bool umount_if_mounted(void)
434{
435 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 436 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
437 return false;
438 }
439 return true;
440}
441
2283e240
CB
/*
 * The type of statfs.f_type differs between platforms, so derive it from the
 * struct itself. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
448
0a4dea41
CB
/*
 * Report whether "/" is the initial rootfs, by scanning /proc/self/mountinfo
 * for an entry whose mount point is "/" and whose filesystem part starts
 * with "- rootfs rootfs ".
 *
 * Looking at fs/proc_namespace.c, it appears we can expect the rootfs entry
 * to very specifically contain " - rootfs rootfs ". IIUC, so long as we've
 * chrooted so that rootfs is not our root, the rootfs entry should always be
 * skipped in mountinfo contents.
 *
 * Returns false when the file cannot be opened or no such entry exists.
 */
static bool is_on_ramfs(void)
{
	char *buf = NULL;
	size_t cap = 0;
	bool found = false;
	FILE *mounts;

	mounts = fopen("/proc/self/mountinfo", "r");
	if (!mounts)
		return false;

	while (!found && getline(&buf, &cap, mounts) != -1) {
		char *mnt = buf, *mnt_end, *sep;
		int skipped = 0;

		/* Advance to the space preceding the fifth field (mount point). */
		while (mnt && skipped < 4) {
			mnt = strchr(mnt + 1, ' ');
			skipped++;
		}
		if (!mnt)
			continue;

		/* Terminate the mount point so it can be compared as a string. */
		mnt_end = strchr(mnt + 1, ' ');
		if (!mnt_end)
			continue;
		*mnt_end = '\0';

		if (strcmp(mnt + 1, "/") != 0)
			continue;

		/* This is '/'. Is it the ramfs? */
		sep = strchr(mnt_end + 1, '-');
		if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
			found = true;
	}

	free(buf);
	fclose(mounts);
	return found;
}
491
cc309f33 492static int pivot_enter()
0a4dea41 493{
cc309f33
CB
494 int ret = -1, oldroot = -1, newroot = -1;
495
496 oldroot = open("/", O_DIRECTORY | O_RDONLY);
497 if (oldroot < 0) {
498 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
499 return ret;
500 }
501
502 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
503 if (newroot < 0) {
504 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
505 goto err;
506 }
507
508 /* change into new root fs */
509 if (fchdir(newroot) < 0) {
510 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
511 goto err;
512 }
513
0a4dea41
CB
514 /* pivot_root into our new root fs */
515 if (pivot_root(".", ".") < 0) {
516 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
cc309f33 517 goto err;
0a4dea41
CB
518 }
519
520 /*
521 * At this point the old-root is mounted on top of our new-root.
522 * To unmounted it we must not be chdir'd into it, so escape back
523 * to the old-root.
524 */
525 if (fchdir(oldroot) < 0) {
526 lxcfs_error("%s\n", "Failed to enter old root.");
cc309f33 527 goto err;
0a4dea41
CB
528 }
529
530 if (umount2(".", MNT_DETACH) < 0) {
531 lxcfs_error("%s\n", "Failed to detach old root.");
cc309f33 532 goto err;
0a4dea41
CB
533 }
534
535 if (fchdir(newroot) < 0) {
536 lxcfs_error("%s\n", "Failed to re-enter new root.");
cc309f33 537 goto err;
0a4dea41
CB
538 }
539
cc309f33
CB
540 ret = 0;
541
542err:
543 if (oldroot > 0)
544 close(oldroot);
545 if (newroot > 0)
546 close(newroot);
547
548 return ret;
0a4dea41
CB
549}
550
551static int chroot_enter()
552{
553 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
554 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
555 return -1;
556 }
557
558 if (chroot(".") < 0) {
559 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
560 return -1;
561 }
562
563 if (chdir("/") < 0) {
564 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
565 return -1;
566 }
567
568 return 0;
569}
570
/*
 * Switch the root to ROOTDIR: via chroot_enter() when "/" is a ramfs/rootfs,
 * via pivot_enter() otherwise.
 *
 * Returns 0 on success, -1 on failure (chroot_enter()'s result is passed
 * through unchanged).
 */
static int permute_and_enter(void)
{
	struct statfs rootfs;

	if (statfs("/", &rootfs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still let
	 * is_on_ramfs() check /proc/self/mountinfo.
	 */
	if (has_fs_type(&rootfs, RAMFS_MAGIC) || is_on_ramfs())
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
593
594/* Prepare our new clean root. */
0232cbac 595static int permute_prepare(void)
29a73c2f
CB
596{
597 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 598 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
599 return -1;
600 }
601
602 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 603 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
604 return -1;
605 }
606
607 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 608 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
609 return -1;
610 }
611
612 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 613 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
614 return -1;
615 }
616
617 return 0;
618}
619
0232cbac
CB
/*
 * Build the new root and enter it. Calls chroot() on ramfs, pivot_root() in
 * all other cases. Returns true only when both steps succeed.
 */
static bool permute_root(void)
{
	/* Prepare the new root, then pivot into it. */
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
633
0a4dea41 634static bool cgfs_prepare_mounts(void)
29a73c2f
CB
635{
636 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 637 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
638 return false;
639 }
480262c9 640
29a73c2f 641 if (!umount_if_mounted()) {
b8defc3d 642 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
643 return false;
644 }
645
646 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 647 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
648 return false;
649 }
650
1d81c6a6 651 cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
0646f250 652 if (cgroup_ops->mntns_fd < 0) {
a257a8ee
CB
653 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
654 return false;
655 }
656
480262c9 657 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 658 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
659 return false;
660 }
480262c9 661
29a73c2f 662 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 663 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
664 return false;
665 }
480262c9 666
29a73c2f
CB
667 return true;
668}
669
0a4dea41 670static bool cgfs_mount_hierarchies(void)
29a73c2f 671{
5fbea8a6
CB
672 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
673 return false;
51c7ca35 674
5fbea8a6
CB
675 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
676 return false;
29a73c2f 677
5fbea8a6
CB
678 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
679 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
680 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
681 if ((*h)->fd < 0)
29a73c2f 682 return false;
29a73c2f 683 }
5fbea8a6 684
29a73c2f
CB
685 return true;
686}
687
/*
 * Full private cgroup mount setup: prepare the mount namespace, mount the
 * hierarchies, then switch into the new root. Returns true on success.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
703
2243c5a9 704static void __attribute__((constructor)) lxcfs_init(void)
237e200e 705{
2aa59b2e 706 __do_close_prot_errno int init_ns = -EBADF, pidfd = -EBADF;
4ec5c9da 707 int i = 0;
2aa59b2e 708 pid_t pid;
5fbea8a6 709 char *cret;
e58dab00 710 char cwd[MAXPATHLEN];
237e200e 711
5fbea8a6
CB
712 cgroup_ops = cgroup_init();
713 if (!cgroup_ops)
2243c5a9 714 log_exit("Failed to initialize cgroup support");
237e200e 715
480262c9 716 /* Preserve initial namespace. */
2aa59b2e
CB
717 pid = getpid();
718 init_ns = preserve_ns(pid, "mnt");
2243c5a9
CB
719 if (init_ns < 0)
720 log_exit("Failed to preserve initial mount namespace");
480262c9 721
e58dab00 722 cret = getcwd(cwd, MAXPATHLEN);
4ec5c9da 723 if (!cret)
2243c5a9 724 log_exit("%s - Could not retrieve current working directory", strerror(errno));
e58dab00 725
480262c9
CB
726 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
727 * to privately mount lxcfs cgroups. */
2243c5a9
CB
728 if (!cgfs_setup_controllers())
729 log_exit("Failed to setup private cgroup mounts for lxcfs");
480262c9 730
2243c5a9
CB
731 if (setns(init_ns, 0) < 0)
732 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
29a73c2f 733
e58dab00 734 if (!cret || chdir(cwd) < 0)
2243c5a9 735 log_exit("%s - Could not change back to original working directory", strerror(errno));
e58dab00 736
2243c5a9
CB
737 if (!init_cpuview())
738 log_exit("Failed to init CPU view");
056adcef 739
4ec5c9da
CB
740 fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd);
741 fprintf(stderr, "hierarchies:\n");
742
743 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
744 __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
745 fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
746 }
2aa59b2e
CB
747
748 pidfd = pidfd_open(pid, 0);
749 if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
750 can_use_pidfd = true;
751 lxcfs_error("Kernel supports pidfds");
752 }
237e200e
SH
753}
754
2243c5a9 755static void __attribute__((destructor)) lxcfs_exit(void)
237e200e 756{
0646f250 757 lxcfs_debug("%s\n", "Running destructor for liblxcfs");
056adcef 758 free_cpuview();
2243c5a9 759 cgroup_exit(cgroup_ops);
1c4b4e38 760}