3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
6 * See COPYING file for details.
13 #ifndef FUSE_USE_VERSION
14 #define FUSE_USE_VERSION 26
17 #define _FILE_OFFSET_BITS 64
36 #include <linux/magic.h>
37 #include <linux/sched.h>
38 #include <sys/epoll.h>
40 #include <sys/mount.h>
41 #include <sys/param.h>
43 #include <sys/socket.h>
44 #include <sys/syscall.h>
45 #include <sys/sysinfo.h>
50 #include "cgroup_fuse.h"
51 #include "cgroups/cgroup.h"
52 #include "cgroups/cgroup_utils.h"
53 #include "memory_utils.h"
54 #include "proc_cpuview.h"
57 static bool can_use_pidfd
;
59 /* Define pivot_root() if missing from the C library */
60 #ifndef HAVE_PIVOT_ROOT
61 static int pivot_root(const char *new_root
, const char *put_old
)
63 #ifdef __NR_pivot_root
64 return syscall(__NR_pivot_root
, new_root
, put_old
);
71 extern int pivot_root(const char *new_root
, const char *put_old
);
75 * A table caching which pid is init for a pid namespace.
76 * When looking up which pid is init for $qpid, we first
77 * 1. Stat /proc/$qpid/ns/pid.
78 * 2. Check whether the ino_t is in our store.
79 * a. if not, fork a child in qpid's ns to send us
80 * ucred.pid = 1, and read the initpid. Cache
81 * initpid and creation time for /proc/initpid
82 * in a new store entry.
83 * b. if so, verify that /proc/initpid still matches
84 * what we have saved. If not, clear the store
85 * entry and go back to a. If so, return the
88 struct pidns_init_store
{
89 ino_t ino
; /* inode number for /proc/$pid/ns/pid */
90 pid_t initpid
; /* the pid of init in that ns */
92 long int ctime
; /* the time at which /proc/$initpid was created */
93 struct pidns_init_store
*next
;
/*
 * Sizing and bucket selection for the init-pid cache.
 *
 * PIDNS_HASH_SIZE is the number of buckets; HASH() reduces a key (the cache
 * is keyed by the inode number of /proc/$pid/ns/pid) to a bucket index.
 * Original author's note on the bucket count: look at how they are allocated
 * in the kernel.
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* One chain head per bucket; entries are linked through their ->next field. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];

/* Mutex protecting pidns_hash_table; taken through store_lock(). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
104 static void lock_mutex(pthread_mutex_t
*l
)
108 ret
= pthread_mutex_lock(l
);
110 log_exit("%s - returned %d\n", strerror(ret
), ret
);
113 struct cgroup_ops
*cgroup_ops
;
115 static void unlock_mutex(pthread_mutex_t
*l
)
119 ret
= pthread_mutex_unlock(l
);
121 log_exit("%s - returned %d\n", strerror(ret
), ret
);
124 static void store_lock(void)
126 lock_mutex(&pidns_store_mutex
);
129 static void store_unlock(void)
131 unlock_mutex(&pidns_store_mutex
);
136 * <pid-as-str> = INTTYPE_TO_STRLEN(pid_t)
/*
 * Size of the buffers that hold a "/proc/<pid>" path built with snprintf().
 *
 * It is the length of the "/proc/" prefix, plus the space reserved for the
 * numeric component (sized for a uint64_t, which is at least as wide as
 * pid_t), plus one extra byte (presumably for the terminating NUL).
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
143 /* Must be called under store_lock */
144 static bool initpid_still_valid(struct pidns_init_store
*entry
)
148 if (entry
->init_pidfd
>= 0) {
149 if (pidfd_send_signal(entry
->init_pidfd
, 0, NULL
, 0))
153 char path
[LXCFS_PROC_PID_LEN
];
155 snprintf(path
, sizeof(path
), "/proc/%d", entry
->initpid
);
157 if (stat(path
, &st
) || entry
->ctime
!= st
.st_ctime
)
164 /* Must be called under store_lock */
165 static void remove_initpid(struct pidns_init_store
*entry
)
167 struct pidns_init_store
*it
;
170 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
173 ino_hash
= HASH(entry
->ino
);
174 if (pidns_hash_table
[ino_hash
] == entry
) {
175 pidns_hash_table
[ino_hash
] = entry
->next
;
176 close_prot_errno_disarm(entry
->init_pidfd
);
181 it
= pidns_hash_table
[ino_hash
];
183 if (it
->next
== entry
) {
184 it
->next
= entry
->next
;
185 close_prot_errno_disarm(entry
->init_pidfd
);
194 /* Must be called under store_lock */
195 static void prune_initpid_store(void)
197 static long int last_prune
= 0;
198 long int now
, threshold
;
201 last_prune
= time(NULL
);
206 if (now
< last_prune
+ PURGE_SECS
)
209 lxcfs_debug("Pruning init pid cache");
212 threshold
= now
- 2 * PURGE_SECS
;
214 for (int i
= 0; i
< PIDNS_HASH_SIZE
; i
++) {
215 for (struct pidns_init_store
*entry
= pidns_hash_table
[i
], *prev
= NULL
; entry
;) {
216 if (entry
->lastcheck
< threshold
) {
217 struct pidns_init_store
*cur
= entry
;
219 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur
->initpid
);
222 prev
->next
= entry
->next
;
224 pidns_hash_table
[i
] = entry
->next
;
226 close_prot_errno_disarm(cur
->init_pidfd
);
236 /* Must be called under store_lock */
237 static void save_initpid(struct stat
*sb
, pid_t pid
)
239 __do_free
struct pidns_init_store
*entry
= NULL
;
240 __do_close_prot_errno
int pidfd
= -EBADF
;
241 char path
[LXCFS_PROC_PID_LEN
];
242 struct lxcfs_opts
*opts
= fuse_get_context()->private_data
;
246 if (opts
->use_pidfd
&& can_use_pidfd
) {
247 pidfd
= pidfd_open(pid
, 0);
252 snprintf(path
, sizeof(path
), "/proc/%d", pid
);
256 entry
= malloc(sizeof(*entry
));
260 ino_hash
= HASH(entry
->ino
);
261 *entry
= (struct pidns_init_store
){
264 .ctime
= st
.st_ctime
,
265 .next
= pidns_hash_table
[ino_hash
],
266 .lastcheck
= time(NULL
),
267 .init_pidfd
= move_fd(pidfd
),
269 pidns_hash_table
[ino_hash
] = move_ptr(entry
);
271 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash
, pid
);
275 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
276 * entry for the inode number and creation time. Verify that the init pid
277 * is still valid. If not, remove it. Return the entry if valid, NULL
279 * Must be called under store_lock
281 static struct pidns_init_store
*lookup_verify_initpid(struct stat
*sb
)
283 struct pidns_init_store
*entry
= pidns_hash_table
[HASH(sb
->st_ino
)];
286 if (entry
->ino
== sb
->st_ino
) {
287 if (initpid_still_valid(entry
)) {
288 entry
->lastcheck
= time(NULL
);
292 remove_initpid(entry
);
301 static int send_creds_clone_wrapper(void *arg
)
305 int sock
= *(int *)arg
;
307 /* we are the child */
312 if (send_creds(sock
, &cred
, v
, true) != SEND_CREDS_OK
)
318 * Let's use the "standard stack limit" (i.e. glibc thread size default) for
321 #define __LXCFS_STACK_SIZE (8 * 1024 * 1024)
322 static pid_t
lxcfs_clone(int (*fn
)(void *), void *arg
, int flags
)
327 stack
= malloc(__LXCFS_STACK_SIZE
);
329 return ret_errno(ENOMEM
);
332 ret
= __clone2(fn
, stack
, __LXCFS_STACK_SIZE
, flags
| SIGCHLD
, arg
, NULL
);
334 ret
= clone(fn
, stack
+ __LXCFS_STACK_SIZE
, flags
| SIGCHLD
, arg
, NULL
);
/*
 * Size of the buffers that hold a "/proc/<pid>/ns/pid" path built with
 * snprintf(): both literal parts, the space reserved for the numeric
 * component (sized for a uint64_t), plus one extra byte.
 */
#define LXCFS_PROC_PID_NS_LEN                                  \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
344 * clone a task which switches to @task's namespace and writes '1'.
345 * over a unix sock so we can read the task's reaper's pid in our
348 * Note: glibc's fork() does not respect pidns, which can lead to failed
349 * assertions inside glibc (and thus failed forks) if the child's pid in
350 * the pidns and the parent pid outside are identical. Using clone prevents
353 static void write_task_init_pid_exit(int sock
, pid_t target
)
355 __do_close_prot_errno
int fd
= -EBADF
;
356 char path
[LXCFS_PROC_PID_NS_LEN
];
359 snprintf(path
, sizeof(path
), "/proc/%d/ns/pid", (int)target
);
360 fd
= open(path
, O_RDONLY
| O_CLOEXEC
);
362 log_exit("write_task_init_pid_exit open of ns/pid");
365 log_exit("Failed to setns to pid namespace of process %d", target
);
367 pid
= lxcfs_clone(send_creds_clone_wrapper
, &sock
, 0);
372 if (!wait_for_pid(pid
))
379 static pid_t
get_init_pid_for_task(pid_t task
)
387 if (socketpair(AF_UNIX
, SOCK_DGRAM
, 0, sock
) < 0)
396 write_task_init_pid_exit(sock
[0], task
);
400 if (!recv_creds(sock
[1], &cred
, &v
))
414 pid_t
lookup_initpid_in_store(pid_t pid
)
417 char path
[LXCFS_PROC_PID_NS_LEN
];
419 struct pidns_init_store
*entry
;
421 snprintf(path
, sizeof(path
), "/proc/%d/ns/pid", pid
);
427 entry
= lookup_verify_initpid(&st
);
429 answer
= entry
->initpid
;
433 answer
= get_init_pid_for_task(pid
);
435 save_initpid(&st
, answer
);
439 * Prune at the end in case we're returning the value we were about to
442 prune_initpid_store();
450 * Functions needed to setup cgroups in the __constructor__.
453 static bool umount_if_mounted(void)
455 if (umount2(BASEDIR
, MNT_DETACH
) < 0 && errno
!= EINVAL
) {
456 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR
, strerror(errno
));
/*
 * Type of the f_type member of struct statfs, derived from the structure
 * itself instead of naming a concrete integer type.
 * __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/*
 * Check whether a statfs result describes a filesystem of the given type.
 *
 * @fs        statfs information to inspect; must not be NULL.
 * @magic_val filesystem magic number to compare against.
 *
 * Returns true if fs->f_type equals @magic_val, false otherwise.
 */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	/* magic_val already has the type of f_type, so no cast is needed. */
	return fs->f_type == magic_val;
}
470 * looking at fs/proc_namespace.c, it appears we can
471 * actually expect the rootfs entry to very specifically contain
472 * " - rootfs rootfs "
473 * IIUC, so long as we've chrooted so that rootfs is not our root,
474 * the rootfs entry should always be skipped in mountinfo contents.
476 static bool is_on_ramfs(void)
478 __do_free
char *line
= NULL
;
479 __do_fclose
FILE *f
= NULL
;
482 f
= fopen("/proc/self/mountinfo", "re");
486 while (getline(&line
, &len
, f
) != -1) {
490 for (p
= line
, i
= 0; p
&& i
< 4; i
++)
491 p
= strchr(p
+ 1, ' ');
495 p2
= strchr(p
+ 1, ' ');
499 if (strcmp(p
+ 1, "/") == 0) {
500 /* This is '/'. Is it the ramfs? */
501 p
= strchr(p2
+ 1, '-');
502 if (p
&& strncmp(p
, "- rootfs rootfs ", 16) == 0)
510 static int pivot_enter()
512 __do_close_prot_errno
int oldroot
= -EBADF
, newroot
= -EBADF
;
514 oldroot
= open("/", O_DIRECTORY
| O_RDONLY
);
516 return log_error_errno(-1, errno
,
517 "Failed to open old root for fchdir");
519 newroot
= open(ROOTDIR
, O_DIRECTORY
| O_RDONLY
);
521 return log_error_errno(-1, errno
,
522 "Failed to open new root for fchdir");
524 /* change into new root fs */
525 if (fchdir(newroot
) < 0)
526 return log_error_errno(-1,
527 errno
, "Failed to change directory to new rootfs: %s",
530 /* pivot_root into our new root fs */
531 if (pivot_root(".", ".") < 0)
532 return log_error_errno(-1, errno
,
533 "pivot_root() syscall failed: %s",
537 * At this point the old-root is mounted on top of our new-root.
538 * To unmount it we must not be chdir'd into it, so escape back
541 if (fchdir(oldroot
) < 0)
542 return log_error_errno(-1, errno
, "Failed to enter old root");
544 if (umount2(".", MNT_DETACH
) < 0)
545 return log_error_errno(-1, errno
, "Failed to detach old root");
547 if (fchdir(newroot
) < 0)
548 return log_error_errno(-1, errno
, "Failed to re-enter new root");
553 static int chroot_enter()
555 if (mount(ROOTDIR
, "/", NULL
, MS_REC
| MS_BIND
, NULL
)) {
556 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR
);
560 if (chroot(".") < 0) {
561 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno
));
565 if (chdir("/") < 0) {
566 lxcfs_error("Failed to change directory: %s.\n", strerror(errno
));
573 static int permute_and_enter(void)
577 if (statfs("/", &sb
) < 0) {
578 lxcfs_error("%s\n", "Could not stat / mountpoint.");
582 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
583 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
584 * /proc/self/mountinfo. */
585 if (has_fs_type(&sb
, RAMFS_MAGIC
) || is_on_ramfs())
586 return chroot_enter();
588 if (pivot_enter() < 0) {
589 lxcfs_error("%s\n", "Could not perform pivot root.");
596 /* Prepare our new clean root. */
597 static int permute_prepare(void)
599 if (mkdir(ROOTDIR
, 0700) < 0 && errno
!= EEXIST
) {
600 lxcfs_error("%s\n", "Failed to create directory for new root.");
604 if (mount("/", ROOTDIR
, NULL
, MS_BIND
, 0) < 0) {
605 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno
));
609 if (mount(RUNTIME_PATH
, ROOTDIR RUNTIME_PATH
, NULL
, MS_BIND
, 0) < 0) {
610 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno
));
614 if (mount(BASEDIR
, ROOTDIR BASEDIR
, NULL
, MS_REC
| MS_MOVE
, 0) < 0) {
615 printf("Failed to move " BASEDIR
" into new root: %s.\n", strerror(errno
));
622 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
623 static bool permute_root(void)
625 /* Prepare new root. */
626 if (permute_prepare() < 0)
629 /* Pivot into new root. */
630 if (permute_and_enter() < 0)
636 static bool cgfs_prepare_mounts(void)
638 if (!mkdir_p(BASEDIR
, 0700)) {
639 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
643 if (!umount_if_mounted()) {
644 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
648 if (unshare(CLONE_NEWNS
) < 0) {
649 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno
));
653 cgroup_ops
->mntns_fd
= preserve_ns(getpid(), "mnt");
654 if (cgroup_ops
->mntns_fd
< 0) {
655 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno
));
659 if (mount(NULL
, "/", NULL
, MS_REC
| MS_PRIVATE
, 0) < 0) {
660 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno
));
664 if (mount("tmpfs", BASEDIR
, "tmpfs", 0, "size=100000,mode=700") < 0) {
665 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
672 static bool cgfs_mount_hierarchies(void)
674 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT
, 0755))
677 if (!cgroup_ops
->mount(cgroup_ops
, BASEDIR
))
680 for (struct hierarchy
**h
= cgroup_ops
->hierarchies
; h
&& *h
; h
++) {
681 __do_free
char *path
= must_make_path(BASEDIR
, (*h
)->mountpoint
, NULL
);
682 (*h
)->fd
= open(path
, O_DIRECTORY
| O_CLOEXEC
| O_NOFOLLOW
);
690 static bool cgfs_setup_controllers(void)
692 if (!cgfs_prepare_mounts())
695 if (!cgfs_mount_hierarchies()) {
696 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
706 static void __attribute__((constructor
)) lxcfs_init(void)
708 __do_close_prot_errno
int init_ns
= -EBADF
, pidfd
= -EBADF
;
712 char cwd
[MAXPATHLEN
];
714 cgroup_ops
= cgroup_init();
716 log_exit("Failed to initialize cgroup support");
718 /* Preserve initial namespace. */
720 init_ns
= preserve_ns(pid
, "mnt");
722 log_exit("Failed to preserve initial mount namespace");
724 cret
= getcwd(cwd
, MAXPATHLEN
);
726 log_exit("%s - Could not retrieve current working directory", strerror(errno
));
728 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
729 * to privately mount lxcfs cgroups. */
730 if (!cgfs_setup_controllers())
731 log_exit("Failed to setup private cgroup mounts for lxcfs");
733 if (setns(init_ns
, 0) < 0)
734 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno
));
736 if (!cret
|| chdir(cwd
) < 0)
737 log_exit("%s - Could not change back to original working directory", strerror(errno
));
740 log_exit("Failed to init CPU view");
742 fprintf(stderr
, "mount namespace: %d\n", cgroup_ops
->mntns_fd
);
743 fprintf(stderr
, "hierarchies:\n");
745 for (struct hierarchy
**h
= cgroup_ops
->hierarchies
; h
&& *h
; h
++, i
++) {
746 __do_free
char *controllers
= lxc_string_join(",", (const char **)(*h
)->controllers
, false);
747 fprintf(stderr
, " %2d: fd: %3d: %s\n", i
, (*h
)->fd
, controllers
?: "");
750 pidfd
= pidfd_open(pid
, 0);
751 if (pidfd
>= 0 && pidfd_send_signal(pidfd
, 0, NULL
, 0) == 0) {
752 can_use_pidfd
= true;
753 lxcfs_error("Kernel supports pidfds");
757 static void __attribute__((destructor
)) lxcfs_exit(void)
759 lxcfs_debug("%s\n", "Running destructor for liblxcfs");
761 cgroup_exit(cgroup_ops
);