3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
6 * See COPYING file for details.
13 #ifndef FUSE_USE_VERSION
14 #define FUSE_USE_VERSION 26
17 #define _FILE_OFFSET_BITS 64
36 #include <linux/magic.h>
37 #include <linux/sched.h>
38 #include <sys/epoll.h>
40 #include <sys/mount.h>
41 #include <sys/param.h>
42 #include <sys/socket.h>
43 #include <sys/syscall.h>
44 #include <sys/sysinfo.h>
49 #include "cgroup_fuse.h"
50 #include "cgroups/cgroup.h"
51 #include "cgroups/cgroup_utils.h"
52 #include "memory_utils.h"
53 #include "proc_cpuview.h"
56 /* Define pivot_root() if missing from the C library */
/*
 * When HAVE_PIVOT_ROOT is not defined, a file-local pivot_root() is
 * provided; with __NR_pivot_root available it forwards new_root and
 * put_old to the raw syscall and returns syscall()'s result.
 */
57 #ifndef HAVE_PIVOT_ROOT
58 static int pivot_root(const char * new_root
, const char * put_old
)
60 #ifdef __NR_pivot_root
61 return syscall(__NR_pivot_root
, new_root
, put_old
);
/* Prototype only: a pivot_root() with the same signature defined elsewhere. */
68 extern int pivot_root(const char * new_root
, const char * put_old
);
72 * A table caching which pid is init for a pid namespace.
73 * When looking up which pid is init for $qpid, we first
74 * 1. Stat /proc/$qpid/ns/pid.
75 * 2. Check whether the ino_t is in our store.
76 * a. if not, fork a child in qpid's ns to send us
77 * ucred.pid = 1, and read the initpid. Cache
78 * initpid and creation time for /proc/initpid
79 * in a new store entry.
80 * b. if so, verify that /proc/initpid still matches
81 * what we have saved. If not, clear the store
82 * entry and go back to a. If so, return the
/*
 * One cached entry of the pid-namespace -> init-pid table. Entries that
 * hash to the same bucket are chained through `next`.
 * NOTE(review): code in this file also reads and writes e->lastcheck; no
 * such member is declared in the lines shown here - confirm against the
 * full definition.
 */
85 struct pidns_init_store
{
86 ino_t ino
; // inode number for /proc/$pid/ns/pid
87 pid_t initpid
; // the pid of init in that ns
88 long int ctime
; // the time at which /proc/$initpid was created
89 struct pidns_init_store
*next
; // next entry in the same hash bucket
93 /* lol - look at how they are allocated in the kernel */
/* Number of buckets in pidns_hash_table. */
94 #define PIDNS_HASH_SIZE 4096
/* Reduce a key (used with the namespace inode number) to a bucket index. */
95 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
/* Bucket heads of the init-pid store; each bucket is a singly linked list. */
97 static struct pidns_init_store
*pidns_hash_table
[PIDNS_HASH_SIZE
];
/* Mutex taken by store_lock() and released by store_unlock(). */
98 static pthread_mutex_t pidns_store_mutex
= PTHREAD_MUTEX_INITIALIZER
;
/*
 * Acquire mutex @l via pthread_mutex_lock(). A non-zero return value is
 * logged with lxcfs_error() as the numeric code plus its strerror() text.
 */
99 static void lock_mutex(pthread_mutex_t
*l
)
103 if ((ret
= pthread_mutex_lock(l
)) != 0) {
104 lxcfs_error("returned:%d %s\n", ret
, strerror(ret
));
/*
 * Global cgroup operations handle. It is assigned from cgroup_init() in
 * lxcfs_init() and handed to cgroup_exit() in lxcfs_exit().
 */
109 struct cgroup_ops
*cgroup_ops
;
/*
 * Release mutex @l via pthread_mutex_unlock(). A non-zero return value is
 * logged with lxcfs_error() as the numeric code plus its strerror() text.
 */
111 static void unlock_mutex(pthread_mutex_t
*l
)
115 if ((ret
= pthread_mutex_unlock(l
)) != 0) {
116 lxcfs_error("returned:%d %s\n", ret
, strerror(ret
));
/* Take pidns_store_mutex, the lock guarding the init-pid store. */
121 static void store_lock(void)
123 lock_mutex(&pidns_store_mutex
);
/* Release pidns_store_mutex, the lock guarding the init-pid store. */
126 static void store_unlock(void)
128 unlock_mutex(&pidns_store_mutex
);
131 /* Must be called under store_lock */
/*
 * Check cached entry @e against the live process: stat() /proc/<e->initpid>
 * and compare that directory's st_ctime with the ctime saved in @e.
 * NOTE(review): @nsfdsb is not referenced in the lines shown here.
 * NOTE(review): the buffer size is passed to snprintf() as the literal 100.
 */
132 static bool initpid_still_valid(struct pidns_init_store
*e
, struct stat
*nsfdsb
)
137 snprintf(fnam
, 100, "/proc/%d", e
->initpid
);
138 if (stat(fnam
, &initsb
) < 0)
141 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e
->ctime
,
142 initsb
.st_ctime
, e
->initpid
);
144 if (e
->ctime
!= initsb
.st_ctime
)
149 /* Must be called under store_lock */
/*
 * Unlink entry @e from bucket h of pidns_hash_table. If @e is the bucket
 * head, the head is advanced to e->next; otherwise the chain is searched
 * for the node whose `next` is @e.
 */
150 static void remove_initpid(struct pidns_init_store
*e
)
152 struct pidns_init_store
*tmp
;
155 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e
->initpid
);
158 if (pidns_hash_table
[h
] == e
) {
159 pidns_hash_table
[h
] = e
->next
;
164 tmp
= pidns_hash_table
[h
];
166 if (tmp
->next
== e
) {
176 /* Must be called under store_lock */
/*
 * Drop stale entries from the init-pid store. `last_prune` is a function
 * static that persists across calls and is compared against `now` using a
 * PURGE_SECS window. Every bucket is then walked and entries whose
 * lastcheck is older than now - 2 * PURGE_SECS are unlinked, either via
 * prev->next or by moving the bucket head.
 */
177 static void prune_initpid_store(void)
179 static long int last_prune
= 0;
180 struct pidns_init_store
*e
, *prev
, *delme
;
181 long int now
, threshold
;
185 last_prune
= time(NULL
);
189 if (now
< last_prune
+ PURGE_SECS
)
192 lxcfs_debug("%s\n", "Pruning.");
/* Entries not checked since this time are considered stale. */
195 threshold
= now
- 2 * PURGE_SECS
;
197 for (i
= 0; i
< PIDNS_HASH_SIZE
; i
++) {
198 for (prev
= NULL
, e
= pidns_hash_table
[i
]; e
; ) {
199 if (e
->lastcheck
< threshold
) {
201 lxcfs_debug("Removing cached entry for %d.\n", e
->initpid
);
/* Interior node: bypass it in the chain. */
205 prev
->next
= e
->next
;
/* Head node: the bucket now starts at its successor. */
207 pidns_hash_table
[i
] = e
->next
;
218 /* Must be called under store_lock */
/*
 * Add a store entry for init pid @pid. The st_ctime of /proc/<pid> is
 * recorded in the entry, which is allocated with malloc(), stamped with
 * the current time in lastcheck, and pushed onto the front of bucket h.
 * NOTE(review): presumably h is HASH(sb->st_ino) and e->ino/e->initpid are
 * filled from @sb/@pid - those assignments are not in the lines shown.
 */
219 static void save_initpid(struct stat
*sb
, pid_t pid
)
221 struct pidns_init_store
*e
;
226 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid
);
228 snprintf(fpath
, 100, "/proc/%d", pid
);
229 if (stat(fpath
, &procsb
) < 0)
232 e
= malloc(sizeof(*e
));
236 e
->ctime
= procsb
.st_ctime
;
/* Insert at the head of the bucket's chain. */
238 e
->next
= pidns_hash_table
[h
];
239 e
->lastcheck
= time(NULL
);
240 pidns_hash_table
[h
] = e
;
244 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
245 * entry for the inode number and creation time. Verify that the init pid
246 * is still valid. If not, remove it. Return the entry if valid, NULL
248 * Must be called under store_lock
/*
 * Look in bucket HASH(sb->st_ino) for an entry whose ino equals
 * sb->st_ino. A match that passes initpid_still_valid() has its lastcheck
 * refreshed to the current time.
 */
250 static struct pidns_init_store
*lookup_verify_initpid(struct stat
*sb
)
252 int h
= HASH(sb
->st_ino
);
253 struct pidns_init_store
*e
= pidns_hash_table
[h
];
256 if (e
->ino
== sb
->st_ino
) {
257 if (initpid_still_valid(e
, sb
)) {
/* Mark the entry as recently used; prune_initpid_store() compares this stamp. */
258 e
->lastcheck
= time(NULL
);
/*
 * Dump cgroup state to stderr: the mount-namespace fd, then one line per
 * entry of cgroup_ops->hierarchies (iteration stops at the first NULL
 * element) showing a running index, the hierarchy fd and its controllers
 * joined with ",". An empty string is printed when the join yields NULL.
 */
276 static void print_subsystems(void)
280 fprintf(stderr
, "mount namespace: %d\n", cgroup_ops
->mntns_fd
);
281 fprintf(stderr
, "hierarchies:\n");
282 for (struct hierarchy
**h
= cgroup_ops
->hierarchies
; h
&& *h
; h
++, i
++) {
/* NOTE(review): __do_free presumably frees `controllers` at scope exit - confirm. */
283 __do_free
char *controllers
= lxc_string_join(",", (const char **)(*h
)->controllers
, false);
284 fprintf(stderr
, " %2d: fd: %3d: %s\n", i
, (*h
)->fd
, controllers
?: "");
/*
 * Report whether @file exists inside @cgroup of @controller's hierarchy.
 * The path "<dot_or_empty(cgroup)><cgroup>/<file>" is formatted into fnam
 * (a truncated or failed snprintf() is detected) and probed with
 * faccessat(F_OK) relative to the fd returned by get_cgroup_fd().
 * Returns true only when faccessat() returns 0.
 */
288 bool cgfs_param_exist(const char *controller
, const char *cgroup
, const char *file
)
294 cfd
= get_cgroup_fd(controller
);
298 /* Make sure we pass a relative path to *at() family of functions.
299 * . + /cgroup + / + file + \0
301 len
= strlen(cgroup
) + strlen(file
) + 3;
303 ret
= snprintf(fnam
, len
, "%s%s/%s", dot_or_empty(cgroup
), cgroup
, file
);
304 if (ret
< 0 || (size_t)ret
>= len
)
307 return (faccessat(cfd
, fnam
, F_OK
, 0) == 0);
// Status codes compared against the result of send_creds().
310 #define SEND_CREDS_OK 0
311 #define SEND_CREDS_NOTSK 1
312 #define SEND_CREDS_FAIL 2
// Forward declarations; both functions are defined further down in this file.
313 static int wait_for_pid(pid_t pid
);
314 static int send_creds_clone_wrapper(void *arg
);
317 * clone a task which switches to @task's namespace and writes '1'.
318 * over a unix sock so we can read the task's reaper's pid in our
321 * Note: glibc's fork() does not respect pidns, which can lead to failed
322 * assertions inside glibc (and thus failed forks) if the child's pid in
323 * the pidns and the parent pid outside are identical. Using clone prevents
// Opens /proc/<target>/ns/pid read-only, then clone()s a child that runs
// send_creds_clone_wrapper() with a pointer to @sock, and waits for that
// child with wait_for_pid(). Failures of the open and setns steps are
// reported with perror().
// NOTE(review): the child stack is a single page obtained with alloca();
// `stack + stack_size` passes the top of that region to clone() and relies
// on void-pointer arithmetic (a GNU extension).
326 static void write_task_init_pid_exit(int sock
, pid_t target
)
331 size_t stack_size
= sysconf(_SC_PAGESIZE
);
332 void *stack
= alloca(stack_size
);
334 ret
= snprintf(fnam
, sizeof(fnam
), "/proc/%d/ns/pid", (int)target
);
// Reject formatting errors and truncated paths.
335 if (ret
< 0 || ret
>= sizeof(fnam
))
338 fd
= open(fnam
, O_RDONLY
);
340 perror("write_task_init_pid_exit open of ns/pid");
344 perror("write_task_init_pid_exit setns 1");
// SIGCHLD as the only flag: the parent is signalled when the child exits.
348 pid
= clone(send_creds_clone_wrapper
, stack
+ stack_size
, SIGCHLD
, &sock
);
352 if (!wait_for_pid(pid
))
// clone() entry point. @arg points to the socket fd; the function calls
// send_creds() on that socket and compares the result with SEND_CREDS_OK.
358 static int send_creds_clone_wrapper(void *arg
) {
361 int sock
= *(int *)arg
;
363 /* we are the child */
368 if (send_creds(sock
, &cred
, v
, true) != SEND_CREDS_OK
)
// Creates an AF_UNIX/SOCK_DGRAM socket pair, passes sock[0] and @task to
// write_task_init_pid_exit(), and reads credentials back from sock[1] with
// recv_creds(). A socketpair() failure is reported with perror().
373 static pid_t
get_init_pid_for_task(pid_t task
)
381 if (socketpair(AF_UNIX
, SOCK_DGRAM
, 0, sock
) < 0) {
382 perror("socketpair");
391 write_task_init_pid_exit(sock
[0], task
);
395 if (!recv_creds(sock
[1], &cred
, &v
))
// Look up the init pid for the pid namespace of @qpid. /proc/<qpid>/ns/pid
// is stat()ed and the result is passed to lookup_verify_initpid(). The
// visible lines also call get_init_pid_for_task() and save_initpid() to
// obtain and cache an answer, and finish with prune_initpid_store().
// NOTE(review): the buffer size is passed to snprintf() as the literal 100.
407 pid_t
lookup_initpid_in_store(pid_t qpid
)
411 struct pidns_init_store
*e
;
414 snprintf(fnam
, 100, "/proc/%d/ns/pid", qpid
);
416 if (stat(fnam
, &sb
) < 0)
418 e
= lookup_verify_initpid(&sb
);
423 answer
= get_init_pid_for_task(qpid
);
425 save_initpid(&sb
, answer
);
428 /* we prune at end in case we are returning
429 * the value we were about to return */
430 prune_initpid_store();
/*
 * Reap child @pid with waitpid() and inspect its status: anything other
 * than a normal exit with exit status 0 takes the failure branch.
 * NOTE(review): the caller tests `!wait_for_pid(pid)`, which suggests 0
 * means success - the return statements are not in the lines shown.
 */
435 static int wait_for_pid(pid_t pid
)
443 ret
= waitpid(pid
, &status
, 0);
451 if (!WIFEXITED(status
) || WEXITSTATUS(status
) != 0)
/* Suffix looked for at the end of a cgroup path. */
456 #define INITSCOPE "/init.scope"
/*
 * Test whether @cg ends with INITSCOPE. Strings shorter than the suffix
 * are rejected up front; otherwise `point` is set to the position where
 * the suffix would start and compared with strcmp().
 * NOTE(review): what is done to @cg on a match is not in the lines shown;
 * the non-const parameter suggests in-place modification - confirm.
 */
457 void prune_init_slice(char *cg
)
460 size_t cg_len
= strlen(cg
), initscope_len
= strlen(INITSCOPE
);
462 if (cg_len
< initscope_len
)
465 point
= cg
+ cg_len
- initscope_len
;
466 if (strcmp(point
, INITSCOPE
) == 0) {
/*
 * Argument bundle carrying a callback of type int (*)(int, pid_t).
 * Only the `wrapped` member is present in the lines shown here.
 */
474 struct pid_ns_clone_args
{
478 int (*wrapped
) (int, pid_t
); // pid_from_ns or pid_to_ns
482 * Functions needed to setup cgroups in the __constructor__.
/*
 * Lazily detach (MNT_DETACH) whatever is mounted on BASEDIR. A umount2()
 * failure with errno == EINVAL is tolerated; any other failure is logged
 * with the path and strerror(errno).
 */
485 static bool umount_if_mounted(void)
487 if (umount2(BASEDIR
, MNT_DETACH
) < 0 && errno
!= EINVAL
) {
488 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR
, strerror(errno
));
494 /* __typeof__ should be safe to use with all compilers. */
/* fs_type_magic is whatever integer type struct statfs uses for f_type. */
495 typedef __typeof__(((struct statfs
*)NULL
)->f_type
) fs_type_magic
;
/* Return true when the filesystem described by @fs has magic number @magic_val. */
496 static bool has_fs_type(const struct statfs
*fs
, fs_type_magic magic_val
)
498 return (fs
->f_type
== (fs_type_magic
)magic_val
);
502 * looking at fs/proc_namespace.c, it appears we can
503 * actually expect the rootfs entry to very specifically contain
504 * " - rootfs rootfs "
505 * IIUC, so long as we've chrooted so that rootfs is not our root,
506 * the rootfs entry should always be skipped in mountinfo contents.
/*
 * Scan /proc/self/mountinfo line by line with getline(). For each line the
 * inner loop advances `p` across four space separators, `p2` marks the
 * next space after that, and the field starting at p + 1 is compared with
 * "/". For the root entry, the text from the first '-' after `p2` is
 * checked against "- rootfs rootfs ".
 */
508 static bool is_on_ramfs(void)
516 f
= fopen("/proc/self/mountinfo", "r");
520 while (getline(&line
, &len
, f
) != -1) {
521 for (p
= line
, i
= 0; p
&& i
< 4; i
++)
522 p
= strchr(p
+ 1, ' ');
525 p2
= strchr(p
+ 1, ' ');
529 if (strcmp(p
+ 1, "/") == 0) {
530 // this is '/'. is it the ramfs?
531 p
= strchr(p2
+ 1, '-');
532 if (p
&& strncmp(p
, "- rootfs rootfs ", 16) == 0) {
/*
 * Enter ROOTDIR with pivot_root(). Directory fds for the old root ("/")
 * and the new root (ROOTDIR) are opened first. The sequence is: fchdir()
 * into the new root, pivot_root(".", "."), fchdir() back to the old root,
 * lazily detach it with umount2(".", MNT_DETACH), and fchdir() into the
 * new root again. Each failing step is logged with lxcfs_error().
 */
544 static int pivot_enter()
546 int ret
= -1, oldroot
= -1, newroot
= -1;
548 oldroot
= open("/", O_DIRECTORY
| O_RDONLY
);
550 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
554 newroot
= open(ROOTDIR
, O_DIRECTORY
| O_RDONLY
);
556 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
560 /* change into new root fs */
561 if (fchdir(newroot
) < 0) {
562 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR
);
566 /* pivot_root into our new root fs */
567 if (pivot_root(".", ".") < 0) {
568 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno
));
573 * At this point the old-root is mounted on top of our new-root.
574 * To unmounted it we must not be chdir'd into it, so escape back
577 if (fchdir(oldroot
) < 0) {
578 lxcfs_error("%s\n", "Failed to enter old root.");
582 if (umount2(".", MNT_DETACH
) < 0) {
583 lxcfs_error("%s\n", "Failed to detach old root.");
587 if (fchdir(newroot
) < 0) {
588 lxcfs_error("%s\n", "Failed to re-enter new root.");
/*
 * Enter ROOTDIR with chroot(): recursively bind-mount ROOTDIR onto "/",
 * chroot(".") and then chdir("/"). Each failing step is logged.
 * NOTE(review): the bind-mount message has no trailing "\n" and no errno
 * text, unlike the other two messages in this function.
 */
603 static int chroot_enter()
605 if (mount(ROOTDIR
, "/", NULL
, MS_REC
| MS_BIND
, NULL
)) {
606 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR
);
610 if (chroot(".") < 0) {
611 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno
));
615 if (chdir("/") < 0) {
616 lxcfs_error("Failed to change directory: %s.\n", strerror(errno
));
/*
 * Choose how to enter the new root. statfs("/") is consulted first; when
 * "/" is a ramfs (by magic number or according to is_on_ramfs()) the
 * result of chroot_enter() is returned, otherwise pivot_enter() is used
 * and its failure is logged.
 */
623 static int permute_and_enter(void)
627 if (statfs("/", &sb
) < 0) {
628 lxcfs_error("%s\n", "Could not stat / mountpoint.");
632 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
633 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
634 * /proc/self/mountinfo (see is_on_ramfs()). */
635 if (has_fs_type(&sb
, RAMFS_MAGIC
) || is_on_ramfs())
636 return chroot_enter();
638 if (pivot_enter() < 0) {
639 lxcfs_error("%s\n", "Could not perform pivot root.");
646 /* Prepare our new clean root. */
/*
 * Steps: create ROOTDIR with mode 0700 (an existing directory is accepted),
 * bind-mount "/" onto it, bind-mount RUNTIME_PATH to the same path below
 * ROOTDIR, and move the BASEDIR mount to the same path below ROOTDIR.
 * NOTE(review): the last failure is reported with printf() while every
 * other step uses lxcfs_error(); the RUNTIME_PATH message hard-codes
 * "/run" rather than using the macro.
 */
647 static int permute_prepare(void)
649 if (mkdir(ROOTDIR
, 0700) < 0 && errno
!= EEXIST
) {
650 lxcfs_error("%s\n", "Failed to create directory for new root.");
654 if (mount("/", ROOTDIR
, NULL
, MS_BIND
, 0) < 0) {
655 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno
));
659 if (mount(RUNTIME_PATH
, ROOTDIR RUNTIME_PATH
, NULL
, MS_BIND
, 0) < 0) {
660 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno
));
664 if (mount(BASEDIR
, ROOTDIR BASEDIR
, NULL
, MS_REC
| MS_MOVE
, 0) < 0) {
665 printf("Failed to move " BASEDIR
" into new root: %s.\n", strerror(errno
));
672 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
/* Runs permute_prepare() and then permute_and_enter(); a negative result from either is treated as failure. */
673 static bool permute_root(void)
675 /* Prepare new root. */
676 if (permute_prepare() < 0)
679 /* Pivot into new root. */
680 if (permute_and_enter() < 0)
/*
 * Prepare a private place for lxcfs' cgroup mounts:
 *  - create BASEDIR (mode 0700) and detach anything already mounted there,
 *  - unshare the mount namespace and keep a handle to it in
 *    cgroup_ops->mntns_fd,
 *  - make "/" recursively private,
 *  - mount a tmpfs ("size=100000,mode=700") over BASEDIR.
 * Each failing step is logged with lxcfs_error().
 */
686 static bool cgfs_prepare_mounts(void)
688 if (!mkdir_p(BASEDIR
, 0700)) {
689 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
693 if (!umount_if_mounted()) {
694 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
698 if (unshare(CLONE_NEWNS
) < 0) {
699 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno
));
703 cgroup_ops
->mntns_fd
= preserve_ns(getpid(), "mnt");
704 if (cgroup_ops
->mntns_fd
< 0) {
705 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno
));
709 if (mount(NULL
, "/", NULL
, MS_REC
| MS_PRIVATE
, 0) < 0) {
710 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno
));
714 if (mount("tmpfs", BASEDIR
, "tmpfs", 0, "size=100000,mode=700") < 0) {
715 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
/*
 * Create BASEDIR DEFAULT_CGROUP_MOUNTPOINT (mode 0755), ask
 * cgroup_ops->mount() to mount the hierarchies below BASEDIR, then open
 * each hierarchy's mountpoint under BASEDIR as a directory fd
 * (O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW) and store it in (*h)->fd.
 */
722 static bool cgfs_mount_hierarchies(void)
724 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT
, 0755))
727 if (!cgroup_ops
->mount(cgroup_ops
, BASEDIR
))
730 for (struct hierarchy
**h
= cgroup_ops
->hierarchies
; h
&& *h
; h
++) {
/* NOTE(review): __do_free presumably frees `path` at scope exit - confirm. */
731 __do_free
char *path
= must_make_path(BASEDIR
, (*h
)->mountpoint
, NULL
);
732 (*h
)->fd
= open(path
, O_DIRECTORY
| O_CLOEXEC
| O_NOFOLLOW
);
/* Runs cgfs_prepare_mounts() followed by cgfs_mount_hierarchies(); a failure of the latter is logged. */
740 static bool cgfs_setup_controllers(void)
742 if (!cgfs_prepare_mounts())
745 if (!cgfs_mount_hierarchies()) {
746 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
/*
 * Library constructor. It initializes cgroup_ops via cgroup_init(), saves
 * a handle to the initial mount namespace and the current working
 * directory, and sets up lxcfs' private cgroup mounts. It then switches
 * back to the saved mount namespace with setns() and back to the saved
 * directory with chdir(). Every failure is reported through log_exit().
 */
756 static void __attribute__((constructor
)) lxcfs_init(void)
/* NOTE(review): __do_close_prot_errno presumably closes init_ns at scope exit - confirm. */
758 __do_close_prot_errno
int init_ns
= -EBADF
;
760 char cwd
[MAXPATHLEN
];
762 cgroup_ops
= cgroup_init();
764 log_exit("Failed to initialize cgroup support");
766 /* Preserve initial namespace. */
767 init_ns
= preserve_ns(getpid(), "mnt");
769 log_exit("Failed to preserve initial mount namespace");
771 cret
= getcwd(cwd
, MAXPATHLEN
);
772 log_exit("%s - Could not retrieve current working directory", strerror(errno
));
774 /* This function calls unshare(CLONE_NEWNS) on our initial mount namespace
775 * to privately mount lxcfs cgroups. */
776 if (!cgfs_setup_controllers())
777 log_exit("Failed to setup private cgroup mounts for lxcfs");
779 if (setns(init_ns
, 0) < 0)
780 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno
));
/* A failed getcwd() earlier (cret == NULL) is also treated as an error here. */
782 if (!cret
|| chdir(cwd
) < 0)
783 log_exit("%s - Could not change back to original working directory", strerror(errno
));
786 log_exit("Failed to init CPU view");
/* Library destructor: logs a debug message and hands cgroup_ops to cgroup_exit(). */
791 static void __attribute__((destructor
)) lxcfs_exit(void)
793 lxcfs_debug("%s\n", "Running destructor for liblxcfs");
795 cgroup_exit(cgroup_ops
);