]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
Merge pull request #330 from brauner/master
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #ifndef _GNU_SOURCE
10 #define _GNU_SOURCE
11 #endif
12
13 #ifndef FUSE_USE_VERSION
14 #define FUSE_USE_VERSION 26
15 #endif
16
17 #define _FILE_OFFSET_BITS 64
18
19 #include <dirent.h>
20 #include <errno.h>
21 #include <fcntl.h>
22 #include <fuse.h>
23 #include <inttypes.h>
24 #include <libgen.h>
25 #include <pthread.h>
26 #include <sched.h>
27 #include <stdarg.h>
28 #include <stdbool.h>
29 #include <stdint.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <time.h>
34 #include <unistd.h>
35 #include <wait.h>
36 #include <linux/magic.h>
37 #include <linux/sched.h>
38 #include <sys/epoll.h>
39 #include <sys/mman.h>
40 #include <sys/mount.h>
41 #include <sys/param.h>
42 #include <signal.h>
43 #include <sys/socket.h>
44 #include <sys/syscall.h>
45 #include <sys/sysinfo.h>
46 #include <sys/vfs.h>
47
48 #include "bindings.h"
49 #include "config.h"
50 #include "cgroup_fuse.h"
51 #include "cgroups/cgroup.h"
52 #include "cgroups/cgroup_utils.h"
53 #include "memory_utils.h"
54 #include "proc_cpuview.h"
55 #include "utils.h"
56
/*
 * Set to true by lxcfs_init() when both pidfd_open() and
 * pidfd_send_signal() succeed on our own pid; consulted by save_initpid().
 */
static bool can_use_pidfd;
58
/*
 * pivot_root(): use the C library's symbol when the build system found one
 * (HAVE_PIVOT_ROOT), otherwise fall back to a thin wrapper around the raw
 * system call. When the syscall number is unknown as well, the wrapper fails
 * with ENOSYS.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
73
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino;     /* inode number for /proc/$pid/ns/pid */
	pid_t initpid; /* the pid of init in that ns */
	int init_pidfd; /* pidfd referring to initpid; negative when no pidfd was opened */
	long int ctime; /* st_ctime of /proc/$initpid recorded when the entry was saved */
	struct pidns_init_store *next; /* next entry in the same hash bucket */
	long int lastcheck; /* time() of creation or of the last successful validation */
};
96
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
/* Map a pid namespace inode number onto a bucket index of pidns_hash_table. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/*
 * Buckets of singly linked pidns_init_store entries, keyed by HASH(ino).
 * All accesses in this file happen under pidns_store_mutex.
 */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
103
/*
 * Acquire @l. A locking failure is treated as fatal and reported through
 * log_exit() together with the pthread error code.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
112
/* Global cgroup handle; assigned from cgroup_init() in lxcfs_init(). */
struct cgroup_ops *cgroup_ops;
114
/*
 * Release @l. An unlocking failure is treated as fatal and reported through
 * log_exit() together with the pthread error code.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
123
/* Take the mutex that protects the init pid cache (pidns_hash_table). */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}
128
/* Drop the mutex that protects the init pid cache (pidns_hash_table). */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
133
/*
 * Buffer size needed to hold "/proc/<pid>":
 *
 *   "/proc/"      STRLITERALLEN("/proc/")
 *   <pid-as-str>  INTTYPE_TO_STRLEN(uint64_t), sized generously for any pid_t
 *   '\0'          1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
142
143 /* Must be called under store_lock */
144 static bool initpid_still_valid(struct pidns_init_store *entry)
145 {
146 bool valid = true;
147
148 if (entry->init_pidfd >= 0) {
149 if (pidfd_send_signal(entry->init_pidfd, 0, NULL, 0))
150 valid = false;
151 } else {
152 struct stat st;
153 char path[LXCFS_PROC_PID_LEN];
154
155 snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
156
157 if (stat(path, &st) || entry->ctime != st.st_ctime)
158 valid = false;
159 }
160
161 return valid;
162 }
163
164 /* Must be called under store_lock */
165 static void remove_initpid(struct pidns_init_store *entry)
166 {
167 struct pidns_init_store *it;
168 int ino_hash;
169
170 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
171 entry->initpid);
172
173 ino_hash = HASH(entry->ino);
174 if (pidns_hash_table[ino_hash] == entry) {
175 pidns_hash_table[ino_hash] = entry->next;
176 close_prot_errno_disarm(entry->init_pidfd);
177 free_disarm(entry);
178 return;
179 }
180
181 it = pidns_hash_table[ino_hash];
182 while (it) {
183 if (it->next == entry) {
184 it->next = entry->next;
185 close_prot_errno_disarm(entry->init_pidfd);
186 free_disarm(entry);
187 return;
188 }
189 it = it->next;
190 }
191 }
192
193 #define PURGE_SECS 5
194 /* Must be called under store_lock */
195 static void prune_initpid_store(void)
196 {
197 static long int last_prune = 0;
198 long int now, threshold;
199
200 if (!last_prune) {
201 last_prune = time(NULL);
202 return;
203 }
204
205 now = time(NULL);
206 if (now < last_prune + PURGE_SECS)
207 return;
208
209 lxcfs_debug("Pruning init pid cache");
210
211 last_prune = now;
212 threshold = now - 2 * PURGE_SECS;
213
214 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
215 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
216 if (entry->lastcheck < threshold) {
217 struct pidns_init_store *cur = entry;
218
219 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
220
221 if (prev)
222 prev->next = entry->next;
223 else
224 pidns_hash_table[i] = entry->next;
225 entry = entry->next;
226 close_prot_errno_disarm(cur->init_pidfd);
227 free_disarm(cur);
228 } else {
229 prev = entry;
230 entry = entry->next;
231 }
232 }
233 }
234 }
235
236 /* Must be called under store_lock */
237 static void save_initpid(struct stat *sb, pid_t pid)
238 {
239 __do_free struct pidns_init_store *entry = NULL;
240 __do_close_prot_errno int pidfd = -EBADF;
241 char path[LXCFS_PROC_PID_LEN];
242 struct lxcfs_opts *opts = fuse_get_context()->private_data;
243 struct stat st;
244 int ino_hash;
245
246 if (opts->use_pidfd && can_use_pidfd) {
247 pidfd = pidfd_open(pid, 0);
248 if (pidfd < 0)
249 return;
250 }
251
252 snprintf(path, sizeof(path), "/proc/%d", pid);
253 if (stat(path, &st))
254 return;
255
256 entry = malloc(sizeof(*entry));
257 if (entry)
258 return;
259
260 ino_hash = HASH(entry->ino);
261 *entry = (struct pidns_init_store){
262 .ino = sb->st_ino,
263 .initpid = pid,
264 .ctime = st.st_ctime,
265 .next = pidns_hash_table[ino_hash],
266 .lastcheck = time(NULL),
267 .init_pidfd = move_fd(pidfd),
268 };
269 pidns_hash_table[ino_hash] = move_ptr(entry);
270
271 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
272 }
273
274 /*
275 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
276 * entry for the inode number and creation time. Verify that the init pid
277 * is still valid. If not, remove it. Return the entry if valid, NULL
278 * otherwise.
279 * Must be called under store_lock
280 */
281 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
282 {
283 struct pidns_init_store *entry = pidns_hash_table[HASH(sb->st_ino)];
284
285 while (entry) {
286 if (entry->ino == sb->st_ino) {
287 if (initpid_still_valid(entry)) {
288 entry->lastcheck = time(NULL);
289 return entry;
290 }
291
292 remove_initpid(entry);
293 return NULL;
294 }
295 entry = entry->next;
296 }
297
298 return NULL;
299 }
300
301 static int send_creds_clone_wrapper(void *arg)
302 {
303 struct ucred cred;
304 char v;
305 int sock = *(int *)arg;
306
307 /* we are the child */
308 cred.uid = 0;
309 cred.gid = 0;
310 cred.pid = 1;
311 v = '1';
312 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
313 return 1;
314 return 0;
315 }
316
317 /*
318 * Let's use the "standard stack limit" (i.e. glibc thread size default) for
319 * stack sizes: 8MB.
320 */
321 #define __LXCFS_STACK_SIZE (8 * 1024 * 1024)
322 static pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags)
323 {
324 pid_t ret;
325 void *stack;
326
327 stack = malloc(__LXCFS_STACK_SIZE);
328 if (!stack)
329 return ret_errno(ENOMEM);
330
331 #ifdef __ia64__
332 ret = __clone2(fn, stack, __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
333 #else
334 ret = clone(fn, stack + __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
335 #endif
336 return ret;
337 }
338
/* Buffer size needed to hold "/proc/<pid>/ns/pid" including the trailing \0. */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
342
343 /*
344 * clone a task which switches to @task's namespace and writes '1'.
345 * over a unix sock so we can read the task's reaper's pid in our
346 * namespace
347 *
348 * Note: glibc's fork() does not respect pidns, which can lead to failed
349 * assertions inside glibc (and thus failed forks) if the child's pid in
350 * the pidns and the parent pid outside are identical. Using clone prevents
351 * this issue.
352 */
353 static void write_task_init_pid_exit(int sock, pid_t target)
354 {
355 __do_close_prot_errno int fd = -EBADF;
356 char path[LXCFS_PROC_PID_NS_LEN];
357 pid_t pid;
358
359 snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
360 fd = open(path, O_RDONLY | O_CLOEXEC);
361 if (fd < 0)
362 log_exit("write_task_init_pid_exit open of ns/pid");
363
364 if (setns(fd, 0))
365 log_exit("Failed to setns to pid namespace of process %d", target);
366
367 pid = lxcfs_clone(send_creds_clone_wrapper, &sock, 0);
368 if (pid < 0)
369 _exit(EXIT_FAILURE);
370
371 if (pid != 0) {
372 if (!wait_for_pid(pid))
373 _exit(EXIT_FAILURE);
374
375 _exit(EXIT_SUCCESS);
376 }
377 }
378
379 static pid_t get_init_pid_for_task(pid_t task)
380 {
381 char v = '0';
382 pid_t pid_ret = -1;
383 pid_t pid;
384 int sock[2];
385 struct ucred cred;
386
387 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
388 return -1;
389
390 pid = fork();
391 if (pid < 0)
392 goto out;
393
394 if (pid == 0) {
395 close(sock[1]);
396 write_task_init_pid_exit(sock[0], task);
397 _exit(EXIT_SUCCESS);
398 }
399
400 if (!recv_creds(sock[1], &cred, &v))
401 goto out;
402
403 pid_ret = cred.pid;
404
405 out:
406 close(sock[0]);
407 close(sock[1]);
408 if (pid > 0)
409 wait_for_pid(pid);
410
411 return pid_ret;
412 }
413
414 pid_t lookup_initpid_in_store(pid_t pid)
415 {
416 pid_t answer = 0;
417 char path[LXCFS_PROC_PID_NS_LEN];
418 struct stat st;
419 struct pidns_init_store *entry;
420
421 snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
422
423 store_lock();
424 if (stat(path, &st))
425 goto out;
426
427 entry = lookup_verify_initpid(&st);
428 if (entry) {
429 answer = entry->initpid;
430 goto out;
431 }
432
433 answer = get_init_pid_for_task(pid);
434 if (answer > 0)
435 save_initpid(&st, answer);
436
437 out:
438 /*
439 * Prune at the end in case we're returning the value we were about to
440 * return.
441 */
442 prune_initpid_store();
443
444 store_unlock();
445
446 return answer;
447 }
448
449 /*
450 * Functions needed to setup cgroups in the __constructor__.
451 */
452
453 static bool umount_if_mounted(void)
454 {
455 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
456 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
457 return false;
458 }
459 return true;
460 }
461
/*
 * The type of statfs.f_type differs between platforms, so derive it from
 * the struct itself. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem described by @fs has magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
468
469 /*
470 * looking at fs/proc_namespace.c, it appears we can
471 * actually expect the rootfs entry to very specifically contain
472 * " - rootfs rootfs "
473 * IIUC, so long as we've chrooted so that rootfs is not our root,
474 * the rootfs entry should always be skipped in mountinfo contents.
475 */
476 static bool is_on_ramfs(void)
477 {
478 __do_free char *line = NULL;
479 __do_fclose FILE *f = NULL;
480 size_t len = 0;
481
482 f = fopen("/proc/self/mountinfo", "re");
483 if (!f)
484 return false;
485
486 while (getline(&line, &len, f) != -1) {
487 int i;
488 char *p, *p2;
489
490 for (p = line, i = 0; p && i < 4; i++)
491 p = strchr(p + 1, ' ');
492 if (!p)
493 continue;
494
495 p2 = strchr(p + 1, ' ');
496 if (!p2)
497 continue;
498 *p2 = '\0';
499 if (strcmp(p + 1, "/") == 0) {
500 /* This is '/'. Is it the ramfs? */
501 p = strchr(p2 + 1, '-');
502 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
503 return true;
504 }
505 }
506
507 return false;
508 }
509
/*
 * Make ROOTDIR the root filesystem by means of pivot_root() and detach the
 * previous root afterwards. Returns 0 on success; on failure the value
 * produced by log_error_errno(-1, ...) is returned.
 */
static int pivot_enter()
{
	/* Both descriptors are closed automatically when the function returns. */
	__do_close_prot_errno int oldroot = -EBADF, newroot = -EBADF;

	oldroot = open("/", O_DIRECTORY | O_RDONLY);
	if (oldroot < 0)
		return log_error_errno(-1, errno,
				       "Failed to open old root for fchdir");

	newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
	if (newroot < 0)
		return log_error_errno(-1, errno,
				       "Failed to open new root for fchdir");

	/* change into new root fs */
	if (fchdir(newroot) < 0)
		return log_error_errno(-1,
				       errno, "Failed to change directory to new rootfs: %s",
				       ROOTDIR);

	/* pivot_root into our new root fs */
	if (pivot_root(".", ".") < 0)
		return log_error_errno(-1, errno,
				       "pivot_root() syscall failed: %s",
				       strerror(errno));

	/*
	 * At this point the old-root is mounted on top of our new-root.
	 * To unmount it we must not be chdir'd into it, so escape back
	 * to the old-root.
	 */
	if (fchdir(oldroot) < 0)
		return log_error_errno(-1, errno, "Failed to enter old root");

	/* Lazily detach the old root, which is our current directory. */
	if (umount2(".", MNT_DETACH) < 0)
		return log_error_errno(-1, errno, "Failed to detach old root");

	if (fchdir(newroot) < 0)
		return log_error_errno(-1, errno, "Failed to re-enter new root");

	return 0;
}
552
/*
 * Switch the root to ROOTDIR using chroot(): recursively bind-mount ROOTDIR
 * onto "/", chroot into the current directory and then chdir to the new "/".
 * Returns 0 on success and -1 on the first failing step.
 */
static int chroot_enter()
{
	if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
		lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
		return -1;
	}

	if (chroot(".") < 0) {
		lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
		return -1;
	}

	if (chdir("/") < 0) {
		lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
		return -1;
	}

	return 0;
}
572
/*
 * Enter the prepared new root: via chroot_enter() when "/" is a ramfs,
 * via pivot_enter() otherwise. Returns 0 on success, -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs root_fs;

	if (statfs("/", &root_fs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * /proc/self/mountinfo.
	 */
	if (has_fs_type(&root_fs, RAMFS_MAGIC) || is_on_ramfs())
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
595
596 /* Prepare our new clean root. */
597 static int permute_prepare(void)
598 {
599 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
600 lxcfs_error("%s\n", "Failed to create directory for new root.");
601 return -1;
602 }
603
604 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
605 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
606 return -1;
607 }
608
609 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
610 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
611 return -1;
612 }
613
614 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
615 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
616 return -1;
617 }
618
619 return 0;
620 }
621
/*
 * Calls chroot() on ramfs, pivot_root() in all other cases.
 * The new root is prepared first; entering it is only attempted when the
 * preparation succeeded. Returns true when both steps succeed.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
635
/*
 * Set up a private mount namespace with an empty tmpfs on BASEDIR.
 * Returns false (after logging) as soon as one step fails.
 */
static bool cgfs_prepare_mounts(void)
{
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	/* Get rid of a mount that may already sit on BASEDIR. */
	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Keep a descriptor for the new mount namespace in cgroup_ops. */
	cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
	if (cgroup_ops->mntns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Mark the whole tree private before mounting anything in it. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
671
672 static bool cgfs_mount_hierarchies(void)
673 {
674 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
675 return false;
676
677 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
678 return false;
679
680 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
681 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
682 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
683 if ((*h)->fd < 0)
684 return false;
685 }
686
687 return true;
688 }
689
/*
 * Prepare the private mounts, mount the cgroup hierarchies and finally
 * switch into the new root. Returns true only when all three steps succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
705
706 static void __attribute__((constructor)) lxcfs_init(void)
707 {
708 __do_close_prot_errno int init_ns = -EBADF, pidfd = -EBADF;
709 int i = 0;
710 pid_t pid;
711 char *cret;
712 char cwd[MAXPATHLEN];
713
714 cgroup_ops = cgroup_init();
715 if (!cgroup_ops)
716 log_exit("Failed to initialize cgroup support");
717
718 /* Preserve initial namespace. */
719 pid = getpid();
720 init_ns = preserve_ns(pid, "mnt");
721 if (init_ns < 0)
722 log_exit("Failed to preserve initial mount namespace");
723
724 cret = getcwd(cwd, MAXPATHLEN);
725 if (!cret)
726 log_exit("%s - Could not retrieve current working directory", strerror(errno));
727
728 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
729 * to privately mount lxcfs cgroups. */
730 if (!cgfs_setup_controllers())
731 log_exit("Failed to setup private cgroup mounts for lxcfs");
732
733 if (setns(init_ns, 0) < 0)
734 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
735
736 if (!cret || chdir(cwd) < 0)
737 log_exit("%s - Could not change back to original working directory", strerror(errno));
738
739 if (!init_cpuview())
740 log_exit("Failed to init CPU view");
741
742 fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd);
743 fprintf(stderr, "hierarchies:\n");
744
745 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
746 __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
747 fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
748 }
749
750 pidfd = pidfd_open(pid, 0);
751 if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
752 can_use_pidfd = true;
753 lxcfs_error("Kernel supports pidfds");
754 }
755 }
756
/*
 * Library destructor: releases the CPU view via free_cpuview() and hands
 * the global cgroup_ops to cgroup_exit().
 */
static void __attribute__((destructor)) lxcfs_exit(void)
{
	lxcfs_debug("%s\n", "Running destructor for liblxcfs");
	free_cpuview();
	cgroup_exit(cgroup_ops);
}