src/bindings.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE
   5 #endif
   6
   7 #include "config.h"
   8
   9 #include <dirent.h>
  10 #include <errno.h>
  11 #include <fcntl.h>
  12 #include <fuse.h>
  13 #include <inttypes.h>
  14 #include <libgen.h>
  15 #include <linux/magic.h>
  16 #include <linux/sched.h>
  17 #include <pthread.h>
  18 #include <sched.h>
  19 #include <stdarg.h>
  20 #include <stdbool.h>
  21 #include <stdint.h>
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <sys/epoll.h>
  26 #include <sys/mman.h>
  27 #include <sys/mount.h>
  28 #include <sys/param.h>
  29 #include <sys/socket.h>
  30 #include <sys/syscall.h>
  31 #include <sys/sysinfo.h>
  32 #include <sys/vfs.h>
  33 #include <time.h>
  34 #include <unistd.h>
  35 #include <wait.h>
  36
  37 #include "api_extensions.h"
  38 #include "bindings.h"
  39 #include "cgroup_fuse.h"
  40 #include "cgroups/cgroup.h"
  41 #include "cgroups/cgroup_utils.h"
  42 #include "memory_utils.h"
  43 #include "proc_cpuview.h"
  44 #include "syscall_numbers.h"
  45 #include "utils.h"
  46
  47 static bool can_use_pidfd;
  48 static bool can_use_swap;
  49 static bool can_use_sys_cpu;
  50 static bool has_versioned_opts;
  51
  52 static volatile sig_atomic_t reload_successful;
  53
  54 bool liblxcfs_functional(void)
  55 {
  56         return reload_successful != 0;
  57 }
  58
/* Accessor for the swap accounting flag probed in lxcfs_init(). */
bool liblxcfs_can_use_swap(void)
{
        return can_use_swap;
}
  63
/* Accessor for the flag that lxcfs_fuse_init() sets to true. */
bool liblxcfs_can_use_sys_cpu(void)
{
        return can_use_sys_cpu;
}
  68
/* Accessor for the flag that lxcfs_fuse_init() sets to true. */
bool liblxcfs_has_versioned_opts(void)
{
        return has_versioned_opts;
}
  73
  74 /* Define pivot_root() if missing from the C library */
  75 #ifndef HAVE_PIVOT_ROOT
  76 static int pivot_root(const char *new_root, const char *put_old)
  77 {
  78         return syscall(__NR_pivot_root, new_root, put_old);
  79 }
  80 #else
  81 extern int pivot_root(const char *new_root, const char *put_old);
  82 #endif
  83
  84 /*
  85  * A table caching which pid is init for a pid namespace.
  86  * When looking up which pid is init for $qpid, we first
  87  * 1. Stat /proc/$qpid/ns/pid.
  88  * 2. Check whether the ino_t is in our store.
  89  *   a. if not, fork a child in qpid's ns to send us
  90  *       ucred.pid = 1, and read the initpid.  Cache
  91  *       initpid and creation time for /proc/initpid
  92  *       in a new store entry.
  93  *   b. if so, verify that /proc/initpid still matches
  94  *       what we have saved.  If not, clear the store
  95  *       entry and go back to a.  If so, return the
  96  *       cached initpid.
  97  */
  98 struct pidns_init_store {
  99         ino_t ino;     /* inode number for /proc/$pid/ns/pid */
 100         pid_t initpid; /* the pid of nit in that ns */
 101         int init_pidfd;
 102         int64_t ctime; /* the time at which /proc/$initpid was created */
 103         struct pidns_init_store *next;
 104         int64_t lastcheck;
 105 };
 106
 107 /* lol - look at how they are allocated in the kernel */
 108 #define PIDNS_HASH_SIZE 4096
 109 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
 110
 111 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 112 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 113
 114 static void mutex_lock(pthread_mutex_t *l)
 115 {
 116         int ret;
 117
 118         ret = pthread_mutex_lock(l);
 119         if (ret)
 120                 log_exit("%s - returned %d\n", strerror(ret), ret);
 121 }
 122
/*
 * Global cgroup driver handle: assigned from cgroup_init() in lxcfs_init()
 * and handed to cgroup_exit() in lxcfs_exit().
 */
struct cgroup_ops *cgroup_ops;
 124
 125 static void mutex_unlock(pthread_mutex_t *l)
 126 {
 127         int ret;
 128
 129         ret = pthread_mutex_unlock(l);
 130         if (ret)
 131                 log_exit("%s - returned %d\n", strerror(ret), ret);
 132 }
 133
/* Take the mutex that protects the pid namespace init cache. */
static inline void store_lock(void)
{
        mutex_lock(&pidns_store_mutex);
}
 138
/* Release the mutex that protects the pid namespace init cache. */
static inline void store_unlock(void)
{
        mutex_unlock(&pidns_store_mutex);
}
 143
 144 /* /proc/       =    6
 145  *                +
 146  * <pid-as-str> =   INTTYPE_TO_STRLEN(pid_t)
 147  *                +
 148  * \0           =    1
 149  */
 150 #define LXCFS_PROC_PID_LEN \
 151         (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + +1)
 152
 153 static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
 154 {
 155         int ret;
 156
 157         if (entry->init_pidfd < 0)
 158                 return ret_errno(ENOSYS);
 159
 160         ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
 161         if (ret < 0) {
 162                 if (errno == ENOSYS)
 163                         return ret_errno(ENOSYS);
 164
 165                 return 0;
 166         }
 167
 168         return 1;
 169 }
 170
 171 static int initpid_still_valid_stat(struct pidns_init_store *entry)
 172 {
 173         struct stat st;
 174         char path[LXCFS_PROC_PID_LEN];
 175
 176         snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
 177         if (stat(path, &st) || entry->ctime != st.st_ctime)
 178                 return 0;
 179
 180         return 1;
 181 }
 182
 183 /* Must be called under store_lock */
 184 static bool initpid_still_valid(struct pidns_init_store *entry)
 185 {
 186         int ret;
 187
 188         ret = initpid_still_valid_pidfd(entry);
 189         if (ret < 0)
 190                 ret = initpid_still_valid_stat(entry);
 191
 192         return ret == 1;
 193 }
 194
 195 /* Must be called under store_lock */
 196 static void remove_initpid(struct pidns_init_store *entry)
 197 {
 198         struct pidns_init_store *it;
 199         int ino_hash;
 200
 201         lxcfs_debug("Removing cached entry for pid %d from init pid cache",
 202                     entry->initpid);
 203
 204         ino_hash = HASH(entry->ino);
 205         if (pidns_hash_table[ino_hash] == entry) {
 206                 pidns_hash_table[ino_hash] = entry->next;
 207                 close_prot_errno_disarm(entry->init_pidfd);
 208                 free_disarm(entry);
 209                 return;
 210         }
 211
 212         it = pidns_hash_table[ino_hash];
 213         while (it) {
 214                 if (it->next == entry) {
 215                         it->next = entry->next;
 216                         close_prot_errno_disarm(entry->init_pidfd);
 217                         free_disarm(entry);
 218                         return;
 219                 }
 220                 it = it->next;
 221         }
 222 }
 223
 224 #define PURGE_SECS 5
 225 /* Must be called under store_lock */
 226 static void prune_initpid_store(void)
 227 {
 228         static int64_t last_prune = 0;
 229         int64_t now, threshold;
 230
 231         if (!last_prune) {
 232                 last_prune = time(NULL);
 233                 return;
 234         }
 235
 236         now = time(NULL);
 237         if (now < (last_prune + PURGE_SECS))
 238                 return;
 239
 240         lxcfs_debug("Pruning init pid cache");
 241
 242         last_prune = now;
 243         threshold = now - 2 * PURGE_SECS;
 244
 245         for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
 246                 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
 247                         if (entry->lastcheck < threshold) {
 248                                 struct pidns_init_store *cur = entry;
 249
 250                                 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
 251
 252                                 if (prev)
 253                                         prev->next = entry->next;
 254                                 else
 255                                         pidns_hash_table[i] = entry->next;
 256                                 entry = entry->next;
 257                                 close_prot_errno_disarm(cur->init_pidfd);
 258                                 free_disarm(cur);
 259                         } else {
 260                                 prev = entry;
 261                                 entry = entry->next;
 262                         }
 263                 }
 264         }
 265 }
 266
 267 static void clear_initpid_store(void)
 268 {
 269         store_lock();
 270         for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
 271                 for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) {
 272                         struct pidns_init_store *cur = entry;
 273
 274                         lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
 275
 276                         pidns_hash_table[i] = entry->next;
 277                         entry = entry->next;
 278                         close_prot_errno_disarm(cur->init_pidfd);
 279                         free_disarm(cur);
 280                 }
 281         }
 282         store_unlock();
 283 }
 284
 285 /* Must be called under store_lock */
 286 static void save_initpid(ino_t pidns_inode, pid_t pid)
 287 {
 288         __do_free struct pidns_init_store *entry = NULL;
 289         __do_close int pidfd = -EBADF;
 290         const struct lxcfs_opts *opts = fuse_get_context()->private_data;
 291         char path[LXCFS_PROC_PID_LEN];
 292         struct stat st;
 293         int ino_hash;
 294
 295         if (opts && opts->use_pidfd && can_use_pidfd) {
 296                 pidfd = pidfd_open(pid, 0);
 297                 if (pidfd < 0)
 298                         return;
 299         }
 300
 301         snprintf(path, sizeof(path), "/proc/%d", pid);
 302         if (stat(path, &st))
 303                 return;
 304
 305         entry = zalloc(sizeof(*entry));
 306         if (!entry)
 307                 return;
 308
 309         ino_hash = HASH(pidns_inode);
 310         *entry = (struct pidns_init_store){
 311                 .ino            = pidns_inode,
 312                 .initpid        = pid,
 313                 .ctime          = st.st_ctime,
 314                 .next           = pidns_hash_table[ino_hash],
 315                 .lastcheck      = time(NULL),
 316                 .init_pidfd     = move_fd(pidfd),
 317         };
 318         pidns_hash_table[ino_hash] = move_ptr(entry);
 319
 320         lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
 321 }
 322
 323 /*
 324  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 325  * entry for the inode number and creation time.  Verify that the init pid
 326  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 327  * otherwise.
 328  * Must be called under store_lock
 329  */
 330 static pid_t lookup_verify_initpid(ino_t pidns_inode)
 331 {
 332         struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
 333
 334         while (entry) {
 335                 if (entry->ino == pidns_inode) {
 336                         if (initpid_still_valid(entry)) {
 337                                 entry->lastcheck = time(NULL);
 338                                 return entry->initpid;
 339                         }
 340
 341                         remove_initpid(entry);
 342                         return ret_errno(ESRCH);
 343                 }
 344                 entry = entry->next;
 345         }
 346
 347         return ret_errno(ESRCH);
 348 }
 349
 350 static bool send_creds_ok(int sock_fd)
 351 {
 352         char v = '1'; /* we are the child */
 353         struct ucred cred = {
 354             .uid = 0,
 355             .gid = 0,
 356             .pid = 1,
 357         };
 358
 359         return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK;
 360 }
 361
 362 __returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
 363 {
 364         /*
 365          * These flags don't interest at all so we don't jump through any hoops
 366          * of retrieving them and passing them to the kernel.
 367          */
 368         errno = EINVAL;
 369         if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
 370                       CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
 371                 return -EINVAL;
 372
 373 #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
 374         /* On s390/s390x and cris the order of the first and second arguments
 375          * of the system call is reversed.
 376          */
 377         return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
 378 #elif defined(__sparc__) && defined(__arch64__)
 379         {
 380                 /*
 381                  * sparc64 always returns the other process id in %o0, and a
 382                  * boolean flag whether this is the child or the parent in %o1.
 383                  * Inline assembly is needed to get the flag returned in %o1.
 384                  */
 385                 register long g1 asm("g1") = __NR_clone;
 386                 register long o0 asm("o0") = flags | SIGCHLD;
 387                 register long o1 asm("o1") = 0; /* is parent/child indicator */
 388                 register long o2 asm("o2") = (unsigned long)pidfd;
 389                 long is_error, retval, in_child;
 390                 pid_t child_pid;
 391
 392                 asm volatile(
 393 #if defined(__arch64__)
 394                     "t 0x6d\n\t" /* 64-bit trap */
 395 #else
 396                     "t 0x10\n\t" /* 32-bit trap */
 397 #endif
 398                     /*
 399                      * catch errors: On sparc, the carry bit (csr) in the
 400                      * processor status register (psr) is used instead of a
 401                      * full register.
 402                      */
 403                     "addx %%g0, 0, %%g1"
 404                     : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
 405                     : "r"(g1), "r"(o0), "r"(o1), "r"(o2)     /* inputs */
 406                     : "%cc");                                /* clobbers */
 407
 408                 is_error = g1;
 409                 retval = o0;
 410                 in_child = o1;
 411
 412                 if (is_error) {
 413                         errno = retval;
 414                         return -1;
 415                 }
 416
 417                 if (in_child)
 418                         return 0;
 419
 420                 child_pid = retval;
 421                 return child_pid;
 422         }
 423 #elif defined(__ia64__)
 424         /* On ia64 the stack and stack size are passed as separate arguments. */
 425         return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd);
 426 #else
 427         return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
 428 #endif
 429 }
 430
 431 #define LXCFS_PROC_PID_NS_LEN                                    \
 432         (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
 433          STRLITERALLEN("/ns/pid") + 1)
 434
 435 /*
 436  * clone a task which switches to @task's namespace and writes '1'.
 437  * over a unix sock so we can read the task's reaper's pid in our
 438  * namespace
 439  *
 440  * Note: glibc's fork() does not respect pidns, which can lead to failed
 441  * assertions inside glibc (and thus failed forks) if the child's pid in
 442  * the pidns and the parent pid outside are identical. Using clone prevents
 443  * this issue.
 444  */
 445 static void write_task_init_pid_exit(int sock, pid_t target)
 446 {
 447         __do_close int fd = -EBADF;
 448         char path[LXCFS_PROC_PID_NS_LEN];
 449         pid_t pid;
 450
 451         snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
 452         fd = open(path, O_RDONLY | O_CLOEXEC);
 453         if (fd < 0)
 454                 log_exit("write_task_init_pid_exit open of ns/pid");
 455
 456         if (setns(fd, 0))
 457                 log_exit("Failed to setns to pid namespace of process %d", target);
 458
 459         pid = lxcfs_raw_clone(0, NULL);
 460         if (pid < 0)
 461                 _exit(EXIT_FAILURE);
 462
 463         if (pid == 0) {
 464                 if (!send_creds_ok(sock))
 465                         _exit(EXIT_FAILURE);
 466
 467                 _exit(EXIT_SUCCESS);
 468         }
 469
 470         if (!wait_for_pid(pid))
 471                 _exit(EXIT_FAILURE);
 472
 473         _exit(EXIT_SUCCESS);
 474 }
 475
 476 static pid_t scm_init_pid(pid_t task)
 477 {
 478         char v = '0';
 479         pid_t pid_ret = -1;
 480         struct ucred cred = {
 481                 .pid = -1,
 482                 .uid = -1,
 483                 .gid = -1,
 484         };
 485         pid_t pid;
 486         int sock[2];
 487
 488         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
 489                 return -1;
 490
 491         pid = fork();
 492         if (pid < 0)
 493                 goto out;
 494
 495         if (pid == 0) {
 496                 close(sock[1]);
 497                 write_task_init_pid_exit(sock[0], task);
 498                 _exit(EXIT_SUCCESS);
 499         }
 500
 501         if (!recv_creds(sock[1], &cred, &v))
 502                 goto out;
 503
 504         pid_ret = cred.pid;
 505
 506 out:
 507         close(sock[0]);
 508         close(sock[1]);
 509         if (pid > 0)
 510                 wait_for_pid(pid);
 511
 512         return pid_ret;
 513 }
 514
 515 pid_t lookup_initpid_in_store(pid_t pid)
 516 {
 517         pid_t hashed_pid = 0;
 518         char path[LXCFS_PROC_PID_NS_LEN];
 519         struct stat st;
 520
 521         snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
 522         if (stat(path, &st))
 523                 return ret_errno(ESRCH);
 524
 525         store_lock();
 526
 527         hashed_pid = lookup_verify_initpid(st.st_ino);
 528         if (hashed_pid < 0) {
 529                 /* release the mutex as the following call is expensive */
 530                 store_unlock();
 531
 532                 hashed_pid = scm_init_pid(pid);
 533
 534                 store_lock();
 535
 536                 if (hashed_pid > 0)
 537                         save_initpid(st.st_ino, hashed_pid);
 538         }
 539
 540         /*
 541          * Prune at the end in case we're pruning the value
 542          * we were about to return.
 543          */
 544         prune_initpid_store();
 545         store_unlock();
 546
 547         return hashed_pid;
 548 }
 549
 550 /*
 551  * Functions needed to setup cgroups in the __constructor__.
 552  */
 553
 554 static bool umount_if_mounted(void)
 555 {
 556         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
 557                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
 558                 return false;
 559         }
 560         return true;
 561 }
 562
 563 /* __typeof__ should be safe to use with all compilers. */
 564 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
 565 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
 566 {
 567         return (fs->f_type == (fs_type_magic)magic_val);
 568 }
 569
 570 /*
 571  * looking at fs/proc_namespace.c, it appears we can
 572  * actually expect the rootfs entry to very specifically contain
 573  * " - rootfs rootfs "
 574  * IIUC, so long as we've chrooted so that rootfs is not our root,
 575  * the rootfs entry should always be skipped in mountinfo contents.
 576  */
 577 static bool is_on_ramfs(void)
 578 {
 579         __do_free char *line = NULL;
 580         __do_free void *fopen_cache = NULL;
 581         __do_fclose FILE *f = NULL;
 582         size_t len = 0;
 583
 584         f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
 585         if (!f)
 586                 return false;
 587
 588         while (getline(&line, &len, f) != -1) {
 589                 int i;
 590                 char *p, *p2;
 591
 592                 for (p = line, i = 0; p && i < 4; i++)
 593                         p = strchr(p + 1, ' ');
 594                 if (!p)
 595                         continue;
 596
 597                 p2 = strchr(p + 1, ' ');
 598                 if (!p2)
 599                         continue;
 600                 *p2 = '\0';
 601                 if (strcmp(p + 1, "/") == 0) {
 602                         /* This is '/'. Is it the ramfs? */
 603                         p = strchr(p2 + 1, '-');
 604                         if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
 605                                 return true;
 606                 }
 607         }
 608
 609         return false;
 610 }
 611
 612 static int pivot_enter()
 613 {
 614         __do_close int oldroot = -EBADF, newroot = -EBADF;
 615
 616         oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
 617         if (oldroot < 0)
 618                 return log_error_errno(-1, errno,
 619                                        "Failed to open old root for fchdir");
 620
 621         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
 622         if (newroot < 0)
 623                 return log_error_errno(-1, errno,
 624                                        "Failed to open new root for fchdir");
 625
 626         /* change into new root fs */
 627         if (fchdir(newroot) < 0)
 628                 return log_error_errno(-1,
 629                                        errno, "Failed to change directory to new rootfs: %s",
 630                                        ROOTDIR);
 631
 632         /* pivot_root into our new root fs */
 633         if (pivot_root(".", ".") < 0)
 634                 return log_error_errno(-1, errno,
 635                                        "pivot_root() syscall failed: %s",
 636                                        strerror(errno));
 637
 638         /*
 639          * At this point the old-root is mounted on top of our new-root.
 640          * To unmounted it we must not be chdir'd into it, so escape back
 641          * to the old-root.
 642          */
 643         if (fchdir(oldroot) < 0)
 644                 return log_error_errno(-1, errno, "Failed to enter old root");
 645
 646         if (umount2(".", MNT_DETACH) < 0)
 647                 return log_error_errno(-1, errno, "Failed to detach old root");
 648
 649         if (fchdir(newroot) < 0)
 650                 return log_error_errno(-1, errno, "Failed to re-enter new root");
 651
 652         return 0;
 653 }
 654
 655 static int chroot_enter()
 656 {
 657         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
 658                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
 659                 return -1;
 660         }
 661
 662         if (chroot(".") < 0) {
 663                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
 664                 return -1;
 665         }
 666
 667         if (chdir("/") < 0) {
 668                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
 669                 return -1;
 670         }
 671
 672         return 0;
 673 }
 674
 675 static int permute_and_enter(void)
 676 {
 677         struct statfs sb;
 678
 679         if (statfs("/", &sb) < 0) {
 680                 lxcfs_error("%s\n", "Could not stat / mountpoint.");
 681                 return -1;
 682         }
 683
 684         /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
 685          * likely report TMPFS_MAGIC. Hence, when it reports no we still check
 686          * /proc/1/mountinfo. */
 687         if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
 688                 return chroot_enter();
 689
 690         if (pivot_enter() < 0) {
 691                 lxcfs_error("%s\n", "Could not perform pivot root.");
 692                 return -1;
 693         }
 694
 695         return 0;
 696 }
 697
 698 /* Prepare our new clean root. */
 699 static int permute_prepare(void)
 700 {
 701         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
 702                 lxcfs_error("%s\n", "Failed to create directory for new root.");
 703                 return -1;
 704         }
 705
 706         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
 707                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
 708                 return -1;
 709         }
 710
 711         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
 712                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
 713                 return -1;
 714         }
 715
 716         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
 717                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
 718                 return -1;
 719         }
 720
 721         return 0;
 722 }
 723
 724 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
 725 static bool permute_root(void)
 726 {
 727         /* Prepare new root. */
 728         if (permute_prepare() < 0)
 729                 return false;
 730
 731         /* Pivot into new root. */
 732         if (permute_and_enter() < 0)
 733                 return false;
 734
 735         return true;
 736 }
 737
/*
 * Set up a private mount namespace with an empty tmpfs on BASEDIR that the
 * cgroup hierarchies can be mounted into.  The steps are order dependent.
 *
 * Returns true on success; on failure an error is logged and false is
 * returned.
 */
static bool cgfs_prepare_mounts(void)
{
        if (!mkdir_p(BASEDIR, 0700)) {
                lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
                return false;
        }

        /* Get rid of a mount possibly left on BASEDIR. */
        if (!umount_if_mounted()) {
                lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
                return false;
        }

        /* Everything mounted from here on lives in our own mount namespace. */
        if (unshare(CLONE_NEWNS) < 0) {
                lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
                return false;
        }

        /* Remember the new mount namespace in cgroup_ops->mntns_fd. */
        cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
        if (cgroup_ops->mntns_fd < 0) {
                lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
                return false;
        }

        /* Recursively mark "/" private. */
        if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
                lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
                return false;
        }

        if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
                lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
                return false;
        }

        return true;
}
 773
 774 static bool cgfs_mount_hierarchies(void)
 775 {
 776         if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
 777                 return false;
 778
 779         if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
 780                 return false;
 781
 782         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
 783                 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
 784                 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
 785                 if ((*h)->fd < 0)
 786                         return false;
 787         }
 788
 789         return true;
 790 }
 791
 792 static bool cgfs_setup_controllers(void)
 793 {
 794         if (!cgfs_prepare_mounts())
 795                 return false;
 796
 797         if (!cgfs_mount_hierarchies())
 798                 return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");
 799
 800         if (!permute_root())
 801                 return false;
 802
 803         return true;
 804 }
 805
 806 static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra)
 807 {
 808         int ret;
 809
 810         if (reload_successful) {
 811                 reload_successful = 0;
 812
 813                 /* write() is async signal safe */
 814                 ret = write(STDERR_FILENO,
 815                             "Switched into non-virtualization mode\n",
 816                             STRLITERALLEN("Switched into non-virtualization mode\n"));
 817                 if (ret < 0)
 818                         goto please_compiler;
 819         } else {
 820                 reload_successful = 1;
 821
 822                 /* write() is async signal safe */
 823                 ret = write(STDERR_FILENO, "Switched into virtualization mode\n",
 824                             STRLITERALLEN("Switched into virtualization mode\n"));
 825                 if (ret < 0)
 826                         goto please_compiler;
 827         }
 828
 829 please_compiler:
 830         /*
 831          * The write() syscall is a function whose return value needs to be
 832          * checked. Otherwise the compiler will warn.Another one could be to
 833          * use syscall(__NR_write, ...) directly but whatever.
 834          */
 835         return;
 836 }
 837
 838 static void __attribute__((constructor)) lxcfs_init(void)
 839 {
 840         __do_close int init_ns = -EBADF, root_fd = -EBADF,
 841                                   pidfd = -EBADF;
 842         int i = 0;
 843         pid_t pid;
 844
 845         lxcfs_info("Running constructor %s to reload liblxcfs", __func__);
 846
 847         cgroup_ops = cgroup_init();
 848         if (!cgroup_ops) {
 849                 lxcfs_info("Failed to initialize cgroup support");
 850                 goto broken_upgrade;
 851         }
 852
 853         /* Preserve initial namespace. */
 854         pid = getpid();
 855         init_ns = preserve_ns(pid, "mnt");
 856         if (init_ns < 0) {
 857                 lxcfs_info("Failed to preserve initial mount namespace");
 858                 goto broken_upgrade;
 859         }
 860
 861         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
 862          * to privately mount lxcfs cgroups. */
 863         if (!cgfs_setup_controllers()) {
 864                 log_exit("Failed to setup private cgroup mounts for lxcfs");
 865                 goto broken_upgrade;
 866         }
 867
 868         if (setns(init_ns, 0) < 0) {
 869                 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
 870                 goto broken_upgrade;
 871         }
 872
 873         if (!init_cpuview()) {
 874                 log_exit("Failed to init CPU view");
 875                 goto broken_upgrade;
 876         }
 877
 878         lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
 879         lxcfs_info("hierarchies:");
 880
 881         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
 882                 char **controller_list = (*h)->controllers;
 883                 __do_free char *controllers = NULL;
 884                 if (controller_list && *controller_list)
 885                         controllers = lxc_string_join(",", (const char **)controller_list, false);
 886                 lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
 887         }
 888
 889         pidfd = pidfd_open(pid, 0);
 890         if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
 891                 can_use_pidfd = true;
 892                 lxcfs_info("Kernel supports pidfds");
 893         }
 894
 895         can_use_swap = cgroup_ops->can_use_swap(cgroup_ops);
 896         if (can_use_swap)
 897                 lxcfs_info("Kernel supports swap accounting");
 898         else
 899                 lxcfs_info("Kernel does not support swap accounting");
 900
 901         lxcfs_info("api_extensions:");
 902         for (i = 0; i < nr_api_extensions; i++)
 903                 lxcfs_info("- %s", api_extensions[i]);
 904
 905         root_fd = open("/", O_PATH | O_CLOEXEC);
 906         if (root_fd < 0)
 907                 lxcfs_info("%s - Failed to open root directory", strerror(errno));
 908         else if (fchdir(root_fd) < 0)
 909                 lxcfs_info("%s - Failed to change to root directory", strerror(errno));
 910
 911         if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
 912                 lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
 913                 goto broken_upgrade;
 914         }
 915
 916         reload_successful = 1;
 917         return;
 918
 919 broken_upgrade:
 920         reload_successful = 0;
 921         lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
 922 }
 923
/*
 * Library destructor: empties the init pid cache, frees the CPU view and
 * passes the global cgroup_ops to cgroup_exit().
 */
static void __attribute__((destructor)) lxcfs_exit(void)
{
        lxcfs_info("Running destructor %s", __func__);

        clear_initpid_store();
        free_cpuview();
        cgroup_exit(cgroup_ops);
}
 932
 933 void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data)
 934 {
 935         struct fuse_context *fc = fuse_get_context();
 936         can_use_sys_cpu = true;
 937         has_versioned_opts = true;
 938         return fc->private_data;
 939 }