]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
proc: split proc virtualization into separate files
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
1f5596dd
CB
9#ifndef _GNU_SOURCE
10#define _GNU_SOURCE
11#endif
12
13#ifndef FUSE_USE_VERSION
237e200e 14#define FUSE_USE_VERSION 26
1f5596dd
CB
15#endif
16
17#define _FILE_OFFSET_BITS 64
237e200e 18
237e200e 19#include <dirent.h>
29a73c2f 20#include <errno.h>
237e200e
SH
21#include <fcntl.h>
22#include <fuse.h>
0ecddf02 23#include <inttypes.h>
237e200e 24#include <libgen.h>
237e200e 25#include <pthread.h>
29a73c2f 26#include <sched.h>
db1b32f6 27#include <stdarg.h>
29a73c2f 28#include <stdbool.h>
0ecddf02 29#include <stdint.h>
29a73c2f
CB
30#include <stdio.h>
31#include <stdlib.h>
32#include <string.h>
33#include <time.h>
34#include <unistd.h>
35#include <wait.h>
d89504c4 36#include <linux/magic.h>
237e200e 37#include <linux/sched.h>
29a73c2f
CB
38#include <sys/epoll.h>
39#include <sys/mman.h>
40#include <sys/mount.h>
237e200e
SH
41#include <sys/param.h>
42#include <sys/socket.h>
29a73c2f 43#include <sys/syscall.h>
0ecddf02 44#include <sys/sysinfo.h>
d89504c4 45#include <sys/vfs.h>
237e200e 46
237e200e 47#include "bindings.h"
1d81c6a6 48#include "config.h"
580fe4df 49#include "cgroup_fuse.h"
5fbea8a6
CB
50#include "cgroups/cgroup.h"
51#include "cgroups/cgroup_utils.h"
c9236032 52#include "memory_utils.h"
1f5596dd 53#include "proc_cpuview.h"
1d81c6a6 54#include "utils.h"
237e200e 55
29a73c2f
CB
/*
 * pivot_root() wrapper.
 *
 * When HAVE_PIVOT_ROOT is not defined a local fallback is provided: it issues
 * the raw system call if __NR_pivot_root is available and otherwise fails
 * with errno set to ENOSYS. When HAVE_PIVOT_ROOT is defined only the
 * prototype is declared here.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
70
237e200e
SH
/*
 * Cache mapping a pid namespace to the pid of its init process.
 *
 * To find the init pid for $qpid we:
 * 1. stat /proc/$qpid/ns/pid;
 * 2. look the resulting inode number up in the store:
 *    a. if it is missing, fork a child into qpid's namespace which sends us
 *       ucred.pid = 1 so that we can read the init pid, then cache the init
 *       pid together with the ctime of /proc/$initpid in a new entry;
 *    b. if it is present, verify that /proc/$initpid still matches what was
 *       saved. On a mismatch the entry is dropped and we continue as in a.;
 *       otherwise the cached init pid is returned.
 */
struct pidns_init_store {
	/* inode number of /proc/$pid/ns/pid */
	ino_t ino;
	/* pid of init in that namespace */
	pid_t initpid;
	/* st_ctime of /proc/$initpid recorded when the entry was saved */
	long ctime;
	/* next entry in the same hash bucket */
	struct pidns_init_store *next;
	/* time() of the last successful lookup; consulted when pruning */
	long lastcheck;
};
92
93/* lol - look at how they are allocated in the kernel */
94#define PIDNS_HASH_SIZE 4096
95#define HASH(x) ((x) % PIDNS_HASH_SIZE)
96
97static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
98static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Acquire @l. A failure to take the lock is treated as fatal: the error code
 * returned by pthread_mutex_lock() is reported and the process exits.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	lxcfs_error("returned:%d %s\n", err, strerror(err));
	exit(1);
}
108
77f4399a 109struct cgroup_ops *cgroup_ops;
29a73c2f 110
237e200e
SH
/*
 * Release @l. A failure to release the lock is treated as fatal: the error
 * code returned by pthread_mutex_unlock() is reported and the process exits.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	lxcfs_error("returned:%d %s\n", err, strerror(err));
	exit(1);
}
120
121static void store_lock(void)
122{
123 lock_mutex(&pidns_store_mutex);
124}
125
126static void store_unlock(void)
127{
128 unlock_mutex(&pidns_store_mutex);
129}
130
131/* Must be called under store_lock */
132static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
133{
134 struct stat initsb;
135 char fnam[100];
136
137 snprintf(fnam, 100, "/proc/%d", e->initpid);
138 if (stat(fnam, &initsb) < 0)
139 return false;
7dd6560a
CB
140
141 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
142 initsb.st_ctime, e->initpid);
143
237e200e
SH
144 if (e->ctime != initsb.st_ctime)
145 return false;
146 return true;
147}
148
149/* Must be called under store_lock */
150static void remove_initpid(struct pidns_init_store *e)
151{
152 struct pidns_init_store *tmp;
153 int h;
154
7dd6560a
CB
155 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
156
237e200e
SH
157 h = HASH(e->ino);
158 if (pidns_hash_table[h] == e) {
159 pidns_hash_table[h] = e->next;
54a6d46a 160 free_disarm(e);
237e200e
SH
161 return;
162 }
163
164 tmp = pidns_hash_table[h];
165 while (tmp) {
166 if (tmp->next == e) {
167 tmp->next = e->next;
54a6d46a 168 free_disarm(e);
237e200e
SH
169 return;
170 }
171 tmp = tmp->next;
172 }
173}
174
175#define PURGE_SECS 5
176/* Must be called under store_lock */
177static void prune_initpid_store(void)
178{
179 static long int last_prune = 0;
180 struct pidns_init_store *e, *prev, *delme;
181 long int now, threshold;
182 int i;
183
184 if (!last_prune) {
185 last_prune = time(NULL);
186 return;
187 }
188 now = time(NULL);
189 if (now < last_prune + PURGE_SECS)
190 return;
7dd6560a
CB
191
192 lxcfs_debug("%s\n", "Pruning.");
193
237e200e
SH
194 last_prune = now;
195 threshold = now - 2 * PURGE_SECS;
196
197 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
198 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
199 if (e->lastcheck < threshold) {
7dd6560a
CB
200
201 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
202
237e200e
SH
203 delme = e;
204 if (prev)
205 prev->next = e->next;
206 else
207 pidns_hash_table[i] = e->next;
208 e = e->next;
54a6d46a 209 free_disarm(delme);
237e200e
SH
210 } else {
211 prev = e;
212 e = e->next;
213 }
214 }
215 }
216}
217
218/* Must be called under store_lock */
219static void save_initpid(struct stat *sb, pid_t pid)
220{
221 struct pidns_init_store *e;
222 char fpath[100];
223 struct stat procsb;
224 int h;
225
7dd6560a
CB
226 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
227
237e200e
SH
228 snprintf(fpath, 100, "/proc/%d", pid);
229 if (stat(fpath, &procsb) < 0)
230 return;
231 do {
232 e = malloc(sizeof(*e));
233 } while (!e);
234 e->ino = sb->st_ino;
235 e->initpid = pid;
236 e->ctime = procsb.st_ctime;
237 h = HASH(e->ino);
238 e->next = pidns_hash_table[h];
239 e->lastcheck = time(NULL);
240 pidns_hash_table[h] = e;
241}
242
243/*
244 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
245 * entry for the inode number and creation time. Verify that the init pid
246 * is still valid. If not, remove it. Return the entry if valid, NULL
247 * otherwise.
248 * Must be called under store_lock
249 */
250static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
251{
252 int h = HASH(sb->st_ino);
253 struct pidns_init_store *e = pidns_hash_table[h];
254
255 while (e) {
256 if (e->ino == sb->st_ino) {
257 if (initpid_still_valid(e, sb)) {
258 e->lastcheck = time(NULL);
259 return e;
260 }
261 remove_initpid(e);
262 return NULL;
263 }
264 e = e->next;
265 }
266
267 return NULL;
268}
269
237e200e
SH
/*
 * File description record: a name plus uid, gid and mode values.
 * NOTE(review): not referenced in the visible part of this file; presumably
 * describes a cgroup file - confirm against its users.
 */
struct cgfs_files {
	char *name;
	uint32_t uid;
	uint32_t gid;
	uint32_t mode;
};
275
237e200e
SH
276static void print_subsystems(void)
277{
5fbea8a6 278 int i = 0;
237e200e 279
0646f250 280 fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd);
cc97d34c 281 fprintf(stderr, "hierarchies:\n");
5fbea8a6
CB
282 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
283 __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
284 fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
237e200e
SH
285 }
286}
287
/*
 * Report whether @file exists inside @cgroup of the hierarchy belonging to
 * @controller. Also returns false when no fd is available for the
 * controller or when the path cannot be formatted.
 */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	size_t pathlen;
	char *relpath;
	int hierarchy_fd, written;

	hierarchy_fd = get_cgroup_fd(controller);
	if (hierarchy_fd < 0)
		return false;

	/*
	 * The *at() family of functions must be given a relative path:
	 * . + /cgroup + / + file + \0
	 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(pathlen);

	written = snprintf(relpath, pathlen, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
	if (written < 0 || (size_t)written >= pathlen)
		return false;

	return faccessat(hierarchy_fd, relpath, F_OK, 0) == 0;
}
309
580fe4df
CB
310#define SEND_CREDS_OK 0
311#define SEND_CREDS_NOTSK 1
312#define SEND_CREDS_FAIL 2
580fe4df 313static int wait_for_pid(pid_t pid);
580fe4df 314static int send_creds_clone_wrapper(void *arg);
237e200e 315
580fe4df
CB
316/*
317 * clone a task which switches to @task's namespace and writes '1'.
318 * over a unix sock so we can read the task's reaper's pid in our
319 * namespace
320 *
321 * Note: glibc's fork() does not respect pidns, which can lead to failed
322 * assertions inside glibc (and thus failed forks) if the child's pid in
323 * the pidns and the parent pid outside are identical. Using clone prevents
324 * this issue.
325 */
326static void write_task_init_pid_exit(int sock, pid_t target)
327{
328 char fnam[100];
329 pid_t pid;
330 int fd, ret;
331 size_t stack_size = sysconf(_SC_PAGESIZE);
332 void *stack = alloca(stack_size);
237e200e 333
580fe4df
CB
334 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
335 if (ret < 0 || ret >= sizeof(fnam))
336 _exit(1);
f23fe717 337
580fe4df
CB
338 fd = open(fnam, O_RDONLY);
339 if (fd < 0) {
340 perror("write_task_init_pid_exit open of ns/pid");
341 _exit(1);
237e200e 342 }
580fe4df
CB
343 if (setns(fd, 0)) {
344 perror("write_task_init_pid_exit setns 1");
345 close(fd);
346 _exit(1);
347 }
348 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
349 if (pid < 0)
350 _exit(1);
351 if (pid != 0) {
352 if (!wait_for_pid(pid))
353 _exit(1);
354 _exit(0);
237e200e 355 }
237e200e
SH
356}
357
580fe4df
CB
358static int send_creds_clone_wrapper(void *arg) {
359 struct ucred cred;
360 char v;
361 int sock = *(int *)arg;
237e200e 362
580fe4df
CB
363 /* we are the child */
364 cred.uid = 0;
365 cred.gid = 0;
366 cred.pid = 1;
367 v = '1';
368 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
369 return 1;
237e200e
SH
370 return 0;
371}
372
580fe4df 373static pid_t get_init_pid_for_task(pid_t task)
237e200e 374{
580fe4df
CB
375 int sock[2];
376 pid_t pid;
377 pid_t ret = -1;
378 char v = '0';
379 struct ucred cred;
237e200e 380
580fe4df
CB
381 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
382 perror("socketpair");
383 return -1;
237e200e
SH
384 }
385
580fe4df
CB
386 pid = fork();
387 if (pid < 0)
388 goto out;
389 if (!pid) {
390 close(sock[1]);
391 write_task_init_pid_exit(sock[0], task);
392 _exit(0);
237e200e 393 }
7213ec5c 394
580fe4df
CB
395 if (!recv_creds(sock[1], &cred, &v))
396 goto out;
397 ret = cred.pid;
237e200e 398
580fe4df
CB
399out:
400 close(sock[0]);
401 close(sock[1]);
402 if (pid > 0)
403 wait_for_pid(pid);
237e200e
SH
404 return ret;
405}
406
580fe4df 407pid_t lookup_initpid_in_store(pid_t qpid)
237e200e 408{
580fe4df
CB
409 pid_t answer = 0;
410 struct stat sb;
411 struct pidns_init_store *e;
412 char fnam[100];
b7672ded 413
580fe4df
CB
414 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
415 store_lock();
416 if (stat(fnam, &sb) < 0)
417 goto out;
418 e = lookup_verify_initpid(&sb);
419 if (e) {
420 answer = e->initpid;
421 goto out;
422 }
423 answer = get_init_pid_for_task(qpid);
424 if (answer > 0)
425 save_initpid(&sb, answer);
b7672ded 426
580fe4df
CB
427out:
428 /* we prune at end in case we are returning
429 * the value we were about to return */
430 prune_initpid_store();
431 store_unlock();
432 return answer;
237e200e
SH
433}
434
/*
 * Wait for child @pid to terminate, retrying when waitpid() is interrupted.
 * Returns 0 if the child exited normally with status 0 and -1 otherwise
 * (including for @pid <= 0 and for waitpid() errors other than EINTR).
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t reaped = waitpid(pid, &status, 0);

		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
		/* interrupted, or not the child we asked for: try again */
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
455
580fe4df
CB
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" component from the cgroup path @cg in
 * place. If the whole string is "/init.scope" it is reduced to "/".
 * Strings that do not end in "/init.scope" are left untouched.
 */
void prune_init_slice(char *cg)
{
	const size_t suffix_len = strlen(INITSCOPE);
	const size_t total_len = strlen(cg);
	char *tail;

	if (total_len < suffix_len)
		return;

	tail = cg + (total_len - suffix_len);
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	/* Keep the leading '/' when nothing precedes the suffix. */
	tail[tail == cg ? 1 : 0] = '\0';
}
473
580fe4df
CB
/*
 * Argument bundle carrying a pipe, a socket, a pid and a callback.
 * NOTE(review): not referenced in the visible part of this file; presumably
 * handed to a clone()d helper - confirm against its users.
 */
struct pid_ns_clone_args {
	int *cpipe;
	int sock;
	pid_t tpid;
	/* pid_from_ns or pid_to_ns */
	int (*wrapped)(int, pid_t);
};
237e200e 480
29a73c2f
CB
481/*
482 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
483 */
484
29a73c2f
CB
485static bool umount_if_mounted(void)
486{
487 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 488 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
489 return false;
490 }
491 return true;
492}
493
2283e240
CB
/*
 * fs_type_magic is whatever type struct statfs uses for f_type.
 * __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true if the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic wanted = (fs_type_magic)magic_val;

	return fs->f_type == wanted;
}
500
0a4dea41
CB
/*
 * Scan /proc/self/mountinfo for an entry mounted on "/" whose remainder
 * contains "- rootfs rootfs ". Returns true if such an entry is found and
 * false otherwise, including when the file cannot be opened.
 *
 * Looking at fs/proc_namespace.c, it appears we can actually expect the
 * rootfs entry to very specifically contain " - rootfs rootfs ". IIUC, so
 * long as we've chrooted so that rootfs is not our root, the rootfs entry
 * should always be skipped in mountinfo contents.
 */
static bool is_on_ramfs(void)
{
	FILE *mountinfo;
	char *line = NULL;
	size_t cap = 0;
	bool found = false;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (!mountinfo)
		return false;

	while (!found && getline(&line, &cap, mountinfo) != -1) {
		char *mnt, *mnt_end, *sep;
		int skipped;

		/* Advance to the 4th space; the field compared with "/" follows it. */
		mnt = line;
		for (skipped = 0; mnt && skipped < 4; skipped++)
			mnt = strchr(mnt + 1, ' ');
		if (!mnt)
			continue;

		mnt_end = strchr(mnt + 1, ' ');
		if (!mnt_end)
			continue;
		*mnt_end = '\0';

		if (strcmp(mnt + 1, "/") != 0)
			continue;

		/* this is '/'. is it the ramfs? */
		sep = strchr(mnt_end + 1, '-');
		found = sep && strncmp(sep, "- rootfs rootfs ", 16) == 0;
	}

	free(line);
	fclose(mountinfo);
	return found;
}
543
cc309f33 544static int pivot_enter()
0a4dea41 545{
cc309f33
CB
546 int ret = -1, oldroot = -1, newroot = -1;
547
548 oldroot = open("/", O_DIRECTORY | O_RDONLY);
549 if (oldroot < 0) {
550 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
551 return ret;
552 }
553
554 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
555 if (newroot < 0) {
556 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
557 goto err;
558 }
559
560 /* change into new root fs */
561 if (fchdir(newroot) < 0) {
562 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
563 goto err;
564 }
565
0a4dea41
CB
566 /* pivot_root into our new root fs */
567 if (pivot_root(".", ".") < 0) {
568 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
cc309f33 569 goto err;
0a4dea41
CB
570 }
571
572 /*
573 * At this point the old-root is mounted on top of our new-root.
574 * To unmounted it we must not be chdir'd into it, so escape back
575 * to the old-root.
576 */
577 if (fchdir(oldroot) < 0) {
578 lxcfs_error("%s\n", "Failed to enter old root.");
cc309f33 579 goto err;
0a4dea41
CB
580 }
581
582 if (umount2(".", MNT_DETACH) < 0) {
583 lxcfs_error("%s\n", "Failed to detach old root.");
cc309f33 584 goto err;
0a4dea41
CB
585 }
586
587 if (fchdir(newroot) < 0) {
588 lxcfs_error("%s\n", "Failed to re-enter new root.");
cc309f33 589 goto err;
0a4dea41
CB
590 }
591
cc309f33
CB
592 ret = 0;
593
594err:
595 if (oldroot > 0)
596 close(oldroot);
597 if (newroot > 0)
598 close(newroot);
599
600 return ret;
0a4dea41
CB
601}
602
603static int chroot_enter()
604{
605 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
606 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
607 return -1;
608 }
609
610 if (chroot(".") < 0) {
611 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
612 return -1;
613 }
614
615 if (chdir("/") < 0) {
616 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
617 return -1;
618 }
619
620 return 0;
621}
622
/*
 * Enter the prepared root: via chroot_enter() when / is on a ramfs, via
 * pivot_enter() otherwise. Returns 0 on success and -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs rootfs;
	bool on_ramfs;

	if (statfs("/", &rootfs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * /proc/self/mountinfo through is_on_ramfs().
	 */
	on_ramfs = has_fs_type(&rootfs, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
645
646/* Prepare our new clean root. */
0232cbac 647static int permute_prepare(void)
29a73c2f
CB
648{
649 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 650 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
651 return -1;
652 }
653
654 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 655 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
656 return -1;
657 }
658
659 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 660 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
661 return -1;
662 }
663
664 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 665 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
666 return -1;
667 }
668
669 return 0;
670}
671
0232cbac
CB
/*
 * Prepare the new root and enter it. Calls chroot() on ramfs, pivot_root()
 * in all other cases. Returns true only if both steps succeed; the second
 * step is not attempted when the first one fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
685
0a4dea41 686static bool cgfs_prepare_mounts(void)
29a73c2f
CB
687{
688 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 689 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
690 return false;
691 }
480262c9 692
29a73c2f 693 if (!umount_if_mounted()) {
b8defc3d 694 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
695 return false;
696 }
697
698 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 699 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
700 return false;
701 }
702
1d81c6a6 703 cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
0646f250 704 if (cgroup_ops->mntns_fd < 0) {
a257a8ee
CB
705 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
706 return false;
707 }
708
480262c9 709 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 710 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
711 return false;
712 }
480262c9 713
29a73c2f 714 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 715 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
716 return false;
717 }
480262c9 718
29a73c2f
CB
719 return true;
720}
721
0a4dea41 722static bool cgfs_mount_hierarchies(void)
29a73c2f 723{
5fbea8a6
CB
724 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
725 return false;
51c7ca35 726
5fbea8a6
CB
727 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
728 return false;
29a73c2f 729
5fbea8a6
CB
730 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
731 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
732 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
733 if ((*h)->fd < 0)
29a73c2f 734 return false;
29a73c2f 735 }
5fbea8a6 736
29a73c2f
CB
737 return true;
738}
739
/*
 * Run the three setup stages in order: prepare the private mounts, mount the
 * cgroup hierarchies, then switch into the new root. Returns false as soon
 * as one stage fails.
 */
static bool cgfs_setup_controllers(void)
{
	bool mounted;

	if (!cgfs_prepare_mounts())
		return false;

	mounted = cgfs_mount_hierarchies();
	if (!mounted) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
755
2243c5a9 756static void __attribute__((constructor)) lxcfs_init(void)
237e200e 757{
2243c5a9 758 __do_close_prot_errno int init_ns = -EBADF;
5fbea8a6 759 char *cret;
e58dab00 760 char cwd[MAXPATHLEN];
237e200e 761
5fbea8a6
CB
762 cgroup_ops = cgroup_init();
763 if (!cgroup_ops)
2243c5a9 764 log_exit("Failed to initialize cgroup support");
237e200e 765
480262c9 766 /* Preserve initial namespace. */
1d81c6a6 767 init_ns = preserve_ns(getpid(), "mnt");
2243c5a9
CB
768 if (init_ns < 0)
769 log_exit("Failed to preserve initial mount namespace");
480262c9 770
e58dab00 771 cret = getcwd(cwd, MAXPATHLEN);
2243c5a9 772 log_exit("%s - Could not retrieve current working directory", strerror(errno));
e58dab00 773
480262c9
CB
774 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
775 * to privately mount lxcfs cgroups. */
2243c5a9
CB
776 if (!cgfs_setup_controllers())
777 log_exit("Failed to setup private cgroup mounts for lxcfs");
480262c9 778
2243c5a9
CB
779 if (setns(init_ns, 0) < 0)
780 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
29a73c2f 781
e58dab00 782 if (!cret || chdir(cwd) < 0)
2243c5a9 783 log_exit("%s - Could not change back to original working directory", strerror(errno));
e58dab00 784
2243c5a9
CB
785 if (!init_cpuview())
786 log_exit("Failed to init CPU view");
056adcef 787
237e200e 788 print_subsystems();
237e200e
SH
789}
790
2243c5a9 791static void __attribute__((destructor)) lxcfs_exit(void)
237e200e 792{
0646f250 793 lxcfs_debug("%s\n", "Running destructor for liblxcfs");
056adcef 794 free_cpuview();
2243c5a9 795 cgroup_exit(cgroup_ops);
1c4b4e38 796}