src/bindings.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include "config.h"
   4
   5 #include <dirent.h>
   6 #include <errno.h>
   7 #include <fcntl.h>
   8 #include <inttypes.h>
   9 #include <libgen.h>
  10 #include <linux/magic.h>
  11 #include <linux/sched.h>
  12 #include <pthread.h>
  13 #include <sched.h>
  14 #include <stdarg.h>
  15 #include <stdbool.h>
  16 #include <stdint.h>
  17 #include <stdio.h>
  18 #include <stdlib.h>
  19 #include <string.h>
  20 #include <sys/epoll.h>
  21 #include <sys/mman.h>
  22 #include <sys/mount.h>
  23 #include <sys/param.h>
  24 #include <sys/socket.h>
  25 #include <sys/syscall.h>
  26 #include <sys/sysinfo.h>
  27 #include <sys/vfs.h>
  28 #include <time.h>
  29 #include <unistd.h>
  30 #include <wait.h>
  31
  32 #include "bindings.h"
  33
  34 #include "api_extensions.h"
  35 #include "cgroup_fuse.h"
  36 #include "cgroups/cgroup.h"
  37 #include "cgroups/cgroup_utils.h"
  38 #include "memory_utils.h"
  39 #include "proc_cpuview.h"
  40 #include "syscall_numbers.h"
  41 #include "utils.h"
  42
/* Set by lxcfs_init() once pidfd_open() and pidfd_send_signal() both succeed. */
static bool can_use_pidfd;
/* Result of cgroup_ops->can_use_swap(), filled in by lxcfs_init(). */
static bool can_use_swap;
/* The two flags below are switched on by lxcfs_fuse_init(). */
static bool can_use_sys_cpu;
static bool has_versioned_opts;

/*
 * Non-zero while liblxcfs is considered functional. Written by the
 * constructor and flipped by the SIGUSR2 handler.
 */
static volatile sig_atomic_t reload_successful;
  49
  50 bool liblxcfs_functional(void)
  51 {
  52         return reload_successful != 0;
  53 }
  54
  55 bool liblxcfs_can_use_swap(void)
  56 {
  57         return can_use_swap;
  58 }
  59
  60 bool liblxcfs_can_use_sys_cpu(void)
  61 {
  62         return can_use_sys_cpu;
  63 }
  64
  65 bool liblxcfs_has_versioned_opts(void)
  66 {
  67         return has_versioned_opts;
  68 }
  69
  70 /* Define pivot_root() if missing from the C library */
  71 #ifndef HAVE_PIVOT_ROOT
  72 static int pivot_root(const char *new_root, const char *put_old)
  73 {
  74         return syscall(__NR_pivot_root, new_root, put_old);
  75 }
  76 #else
  77 extern int pivot_root(const char *new_root, const char *put_old);
  78 #endif
  79
  80 /*
  81  * A table caching which pid is init for a pid namespace.
  82  * When looking up which pid is init for $qpid, we first
  83  * 1. Stat /proc/$qpid/ns/pid.
  84  * 2. Check whether the ino_t is in our store.
  85  *   a. if not, fork a child in qpid's ns to send us
  86  *       ucred.pid = 1, and read the initpid.  Cache
  87  *       initpid and creation time for /proc/initpid
  88  *       in a new store entry.
  89  *   b. if so, verify that /proc/initpid still matches
  90  *       what we have saved.  If not, clear the store
  91  *       entry and go back to a.  If so, return the
  92  *       cached initpid.
  93  */
/* One cache entry mapping a pid namespace inode to that namespace's init pid. */
struct pidns_init_store {
        ino_t ino;     /* inode number for /proc/$pid/ns/pid */
        pid_t initpid; /* the pid of init in that ns */
        int init_pidfd; /* pidfd for initpid; negative when no pidfd was opened */
        int64_t ctime; /* st_ctime of /proc/$initpid recorded when the entry was saved */
        struct pidns_init_store *next; /* next entry in the same hash bucket */
        int64_t lastcheck; /* time(NULL) of the last successful validation; used for pruning */
};
 102
/* lol - look at how they are allocated in the kernel */
/* Number of buckets in the init pid cache. */
#define PIDNS_HASH_SIZE 4096
/* Map a pid namespace inode number to its bucket index. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads; entries sharing a bucket are chained through ->next. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Mutex taken by store_lock()/store_unlock() around accesses to the table. */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 109
 110 static void mutex_lock(pthread_mutex_t *l)
 111 {
 112         int ret;
 113
 114         ret = pthread_mutex_lock(l);
 115         if (ret)
 116                 log_exit("%s - returned %d\n", strerror(ret), ret);
 117 }
 118
/* Global cgroup handle: assigned from cgroup_init() in lxcfs_init(), passed to cgroup_exit() in lxcfs_exit(). */
struct cgroup_ops *cgroup_ops;
 120
 121 static void mutex_unlock(pthread_mutex_t *l)
 122 {
 123         int ret;
 124
 125         ret = pthread_mutex_unlock(l);
 126         if (ret)
 127                 log_exit("%s - returned %d\n", strerror(ret), ret);
 128 }
 129
 130 static inline void store_lock(void)
 131 {
 132         mutex_lock(&pidns_store_mutex);
 133 }
 134
 135 static inline void store_unlock(void)
 136 {
 137         mutex_unlock(&pidns_store_mutex);
 138 }
 139
 140 /* /proc/       =    6
 141  *                +
 142  * <pid-as-str> =   INTTYPE_TO_STRLEN(pid_t)
 143  *                +
 144  * \0           =    1
 145  */
 146 #define LXCFS_PROC_PID_LEN \
 147         (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + +1)
 148
 149 static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
 150 {
 151         int ret;
 152
 153         if (entry->init_pidfd < 0)
 154                 return ret_errno(ENOSYS);
 155
 156         ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
 157         if (ret < 0) {
 158                 if (errno == ENOSYS)
 159                         return ret_errno(ENOSYS);
 160
 161                 return 0;
 162         }
 163
 164         return 1;
 165 }
 166
 167 static int initpid_still_valid_stat(struct pidns_init_store *entry)
 168 {
 169         struct stat st;
 170         char path[LXCFS_PROC_PID_LEN];
 171
 172         snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
 173         if (stat(path, &st) || entry->ctime != st.st_ctime)
 174                 return 0;
 175
 176         return 1;
 177 }
 178
 179 /* Must be called under store_lock */
 180 static bool initpid_still_valid(struct pidns_init_store *entry)
 181 {
 182         int ret;
 183
 184         ret = initpid_still_valid_pidfd(entry);
 185         if (ret < 0)
 186                 ret = initpid_still_valid_stat(entry);
 187
 188         return ret == 1;
 189 }
 190
 191 /* Must be called under store_lock */
 192 static void remove_initpid(struct pidns_init_store *entry)
 193 {
 194         struct pidns_init_store *it;
 195         int ino_hash;
 196
 197         lxcfs_debug("Removing cached entry for pid %d from init pid cache",
 198                     entry->initpid);
 199
 200         ino_hash = HASH(entry->ino);
 201         if (pidns_hash_table[ino_hash] == entry) {
 202                 pidns_hash_table[ino_hash] = entry->next;
 203                 close_prot_errno_disarm(entry->init_pidfd);
 204                 free_disarm(entry);
 205                 return;
 206         }
 207
 208         it = pidns_hash_table[ino_hash];
 209         while (it) {
 210                 if (it->next == entry) {
 211                         it->next = entry->next;
 212                         close_prot_errno_disarm(entry->init_pidfd);
 213                         free_disarm(entry);
 214                         return;
 215                 }
 216                 it = it->next;
 217         }
 218 }
 219
 220 #define PURGE_SECS 5
 221 /* Must be called under store_lock */
 222 static void prune_initpid_store(void)
 223 {
 224         static int64_t last_prune = 0;
 225         int64_t now, threshold;
 226
 227         if (!last_prune) {
 228                 last_prune = time(NULL);
 229                 return;
 230         }
 231
 232         now = time(NULL);
 233         if (now < (last_prune + PURGE_SECS))
 234                 return;
 235
 236         lxcfs_debug("Pruning init pid cache");
 237
 238         last_prune = now;
 239         threshold = now - 2 * PURGE_SECS;
 240
 241         for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
 242                 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
 243                         if (entry->lastcheck < threshold) {
 244                                 struct pidns_init_store *cur = entry;
 245
 246                                 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
 247
 248                                 if (prev)
 249                                         prev->next = entry->next;
 250                                 else
 251                                         pidns_hash_table[i] = entry->next;
 252                                 entry = entry->next;
 253                                 close_prot_errno_disarm(cur->init_pidfd);
 254                                 free_disarm(cur);
 255                         } else {
 256                                 prev = entry;
 257                                 entry = entry->next;
 258                         }
 259                 }
 260         }
 261 }
 262
 263 static void clear_initpid_store(void)
 264 {
 265         store_lock();
 266         for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
 267                 for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) {
 268                         struct pidns_init_store *cur = entry;
 269
 270                         lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
 271
 272                         pidns_hash_table[i] = entry->next;
 273                         entry = entry->next;
 274                         close_prot_errno_disarm(cur->init_pidfd);
 275                         free_disarm(cur);
 276                 }
 277         }
 278         store_unlock();
 279 }
 280
 281 /* Must be called under store_lock */
 282 static void save_initpid(ino_t pidns_inode, pid_t pid)
 283 {
 284         __do_free struct pidns_init_store *entry = NULL;
 285         __do_close int pidfd = -EBADF;
 286         const struct lxcfs_opts *opts = fuse_get_context()->private_data;
 287         char path[LXCFS_PROC_PID_LEN];
 288         struct stat st;
 289         int ino_hash;
 290
 291         if (opts && opts->use_pidfd && can_use_pidfd) {
 292                 pidfd = pidfd_open(pid, 0);
 293                 if (pidfd < 0)
 294                         return;
 295         }
 296
 297         snprintf(path, sizeof(path), "/proc/%d", pid);
 298         if (stat(path, &st))
 299                 return;
 300
 301         entry = zalloc(sizeof(*entry));
 302         if (!entry)
 303                 return;
 304
 305         ino_hash = HASH(pidns_inode);
 306         *entry = (struct pidns_init_store){
 307                 .ino            = pidns_inode,
 308                 .initpid        = pid,
 309                 .ctime          = st.st_ctime,
 310                 .next           = pidns_hash_table[ino_hash],
 311                 .lastcheck      = time(NULL),
 312                 .init_pidfd     = move_fd(pidfd),
 313         };
 314         pidns_hash_table[ino_hash] = move_ptr(entry);
 315
 316         lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
 317 }
 318
 319 /*
 320  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 321  * entry for the inode number and creation time.  Verify that the init pid
 322  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 323  * otherwise.
 324  * Must be called under store_lock
 325  */
 326 static pid_t lookup_verify_initpid(ino_t pidns_inode)
 327 {
 328         struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
 329
 330         while (entry) {
 331                 if (entry->ino == pidns_inode) {
 332                         if (initpid_still_valid(entry)) {
 333                                 entry->lastcheck = time(NULL);
 334                                 return entry->initpid;
 335                         }
 336
 337                         remove_initpid(entry);
 338                         return ret_errno(ESRCH);
 339                 }
 340                 entry = entry->next;
 341         }
 342
 343         return ret_errno(ESRCH);
 344 }
 345
 346 static bool send_creds_ok(int sock_fd)
 347 {
 348         char v = '1'; /* we are the child */
 349         struct ucred cred = {
 350             .uid = 0,
 351             .gid = 0,
 352             .pid = 1,
 353         };
 354
 355         return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK;
 356 }
 357
 358 __returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
 359 {
 360         /*
 361          * These flags don't interest at all so we don't jump through any hoops
 362          * of retrieving them and passing them to the kernel.
 363          */
 364         errno = EINVAL;
 365         if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
 366                       CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
 367                 return -EINVAL;
 368
 369 #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
 370         /* On s390/s390x and cris the order of the first and second arguments
 371          * of the system call is reversed.
 372          */
 373         return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
 374 #elif defined(__sparc__) && defined(__arch64__)
 375         {
 376                 /*
 377                  * sparc64 always returns the other process id in %o0, and a
 378                  * boolean flag whether this is the child or the parent in %o1.
 379                  * Inline assembly is needed to get the flag returned in %o1.
 380                  */
 381                 register long g1 asm("g1") = __NR_clone;
 382                 register long o0 asm("o0") = flags | SIGCHLD;
 383                 register long o1 asm("o1") = 0; /* is parent/child indicator */
 384                 register long o2 asm("o2") = (unsigned long)pidfd;
 385                 long is_error, retval, in_child;
 386                 pid_t child_pid;
 387
 388                 asm volatile(
 389 #if defined(__arch64__)
 390                     "t 0x6d\n\t" /* 64-bit trap */
 391 #else
 392                     "t 0x10\n\t" /* 32-bit trap */
 393 #endif
 394                     /*
 395                      * catch errors: On sparc, the carry bit (csr) in the
 396                      * processor status register (psr) is used instead of a
 397                      * full register.
 398                      */
 399                     "addx %%g0, 0, %%g1"
 400                     : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
 401                     : "r"(g1), "r"(o0), "r"(o1), "r"(o2)     /* inputs */
 402                     : "%cc");                                /* clobbers */
 403
 404                 is_error = g1;
 405                 retval = o0;
 406                 in_child = o1;
 407
 408                 if (is_error) {
 409                         errno = retval;
 410                         return -1;
 411                 }
 412
 413                 if (in_child)
 414                         return 0;
 415
 416                 child_pid = retval;
 417                 return child_pid;
 418         }
 419 #elif defined(__ia64__)
 420         /* On ia64 the stack and stack size are passed as separate arguments. */
 421         return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd);
 422 #else
 423         return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
 424 #endif
 425 }
 426
/* Buffer size for a "/proc/<pid>/ns/pid" path, including the trailing \0. */
#define LXCFS_PROC_PID_NS_LEN                                    \
        (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
         STRLITERALLEN("/ns/pid") + 1)
 430
 431 /*
 432  * clone a task which switches to @task's namespace and writes '1'.
 433  * over a unix sock so we can read the task's reaper's pid in our
 434  * namespace
 435  *
 436  * Note: glibc's fork() does not respect pidns, which can lead to failed
 437  * assertions inside glibc (and thus failed forks) if the child's pid in
 438  * the pidns and the parent pid outside are identical. Using clone prevents
 439  * this issue.
 440  */
 441 static void write_task_init_pid_exit(int sock, pid_t target)
 442 {
 443         __do_close int fd = -EBADF;
 444         char path[LXCFS_PROC_PID_NS_LEN];
 445         pid_t pid;
 446
 447         snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
 448         fd = open(path, O_RDONLY | O_CLOEXEC);
 449         if (fd < 0)
 450                 log_exit("write_task_init_pid_exit open of ns/pid");
 451
 452         if (setns(fd, 0))
 453                 log_exit("Failed to setns to pid namespace of process %d", target);
 454
 455         pid = lxcfs_raw_clone(0, NULL);
 456         if (pid < 0)
 457                 _exit(EXIT_FAILURE);
 458
 459         if (pid == 0) {
 460                 if (!send_creds_ok(sock))
 461                         _exit(EXIT_FAILURE);
 462
 463                 _exit(EXIT_SUCCESS);
 464         }
 465
 466         if (!wait_for_pid(pid))
 467                 _exit(EXIT_FAILURE);
 468
 469         _exit(EXIT_SUCCESS);
 470 }
 471
 472 static pid_t scm_init_pid(pid_t task)
 473 {
 474         char v = '0';
 475         pid_t pid_ret = -1;
 476         struct ucred cred = {
 477                 .pid = -1,
 478                 .uid = -1,
 479                 .gid = -1,
 480         };
 481         pid_t pid;
 482         int sock[2];
 483
 484         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
 485                 return -1;
 486
 487         pid = fork();
 488         if (pid < 0)
 489                 goto out;
 490
 491         if (pid == 0) {
 492                 close(sock[1]);
 493                 write_task_init_pid_exit(sock[0], task);
 494                 _exit(EXIT_SUCCESS);
 495         }
 496
 497         if (!recv_creds(sock[1], &cred, &v))
 498                 goto out;
 499
 500         pid_ret = cred.pid;
 501
 502 out:
 503         close(sock[0]);
 504         close(sock[1]);
 505         if (pid > 0)
 506                 wait_for_pid(pid);
 507
 508         return pid_ret;
 509 }
 510
 511 pid_t lookup_initpid_in_store(pid_t pid)
 512 {
 513         pid_t hashed_pid = 0;
 514         char path[LXCFS_PROC_PID_NS_LEN];
 515         struct stat st;
 516
 517         snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
 518         if (stat(path, &st))
 519                 return ret_errno(ESRCH);
 520
 521         store_lock();
 522
 523         hashed_pid = lookup_verify_initpid(st.st_ino);
 524         if (hashed_pid < 0) {
 525                 /* release the mutex as the following call is expensive */
 526                 store_unlock();
 527
 528                 hashed_pid = scm_init_pid(pid);
 529
 530                 store_lock();
 531
 532                 if (hashed_pid > 0)
 533                         save_initpid(st.st_ino, hashed_pid);
 534         }
 535
 536         /*
 537          * Prune at the end in case we're pruning the value
 538          * we were about to return.
 539          */
 540         prune_initpid_store();
 541         store_unlock();
 542
 543         return hashed_pid;
 544 }
 545
 546 /*
 547  * Functions needed to setup cgroups in the __constructor__.
 548  */
 549
 550 static bool umount_if_mounted(void)
 551 {
 552         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
 553                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
 554                 return false;
 555         }
 556         return true;
 557 }
 558
 559 /* __typeof__ should be safe to use with all compilers. */
 560 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
 561 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
 562 {
 563         return (fs->f_type == (fs_type_magic)magic_val);
 564 }
 565
 566 /*
 567  * looking at fs/proc_namespace.c, it appears we can
 568  * actually expect the rootfs entry to very specifically contain
 569  * " - rootfs rootfs "
 570  * IIUC, so long as we've chrooted so that rootfs is not our root,
 571  * the rootfs entry should always be skipped in mountinfo contents.
 572  */
 573 static bool is_on_ramfs(void)
 574 {
 575         __do_free char *line = NULL;
 576         __do_free void *fopen_cache = NULL;
 577         __do_fclose FILE *f = NULL;
 578         size_t len = 0;
 579
 580         f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
 581         if (!f)
 582                 return false;
 583
 584         while (getline(&line, &len, f) != -1) {
 585                 int i;
 586                 char *p, *p2;
 587
 588                 for (p = line, i = 0; p && i < 4; i++)
 589                         p = strchr(p + 1, ' ');
 590                 if (!p)
 591                         continue;
 592
 593                 p2 = strchr(p + 1, ' ');
 594                 if (!p2)
 595                         continue;
 596                 *p2 = '\0';
 597                 if (strcmp(p + 1, "/") == 0) {
 598                         /* This is '/'. Is it the ramfs? */
 599                         p = strchr(p2 + 1, '-');
 600                         if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
 601                                 return true;
 602                 }
 603         }
 604
 605         return false;
 606 }
 607
 608 static int pivot_enter(void)
 609 {
 610         __do_close int oldroot = -EBADF, newroot = -EBADF;
 611
 612         oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
 613         if (oldroot < 0)
 614                 return log_error_errno(-1, errno,
 615                                        "Failed to open old root for fchdir");
 616
 617         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
 618         if (newroot < 0)
 619                 return log_error_errno(-1, errno,
 620                                        "Failed to open new root for fchdir");
 621
 622         /* change into new root fs */
 623         if (fchdir(newroot) < 0)
 624                 return log_error_errno(-1,
 625                                        errno, "Failed to change directory to new rootfs: %s",
 626                                        ROOTDIR);
 627
 628         /* pivot_root into our new root fs */
 629         if (pivot_root(".", ".") < 0)
 630                 return log_error_errno(-1, errno,
 631                                        "pivot_root() syscall failed: %s",
 632                                        strerror(errno));
 633
 634         /*
 635          * At this point the old-root is mounted on top of our new-root.
 636          * To unmounted it we must not be chdir'd into it, so escape back
 637          * to the old-root.
 638          */
 639         if (fchdir(oldroot) < 0)
 640                 return log_error_errno(-1, errno, "Failed to enter old root");
 641
 642         if (umount2(".", MNT_DETACH) < 0)
 643                 return log_error_errno(-1, errno, "Failed to detach old root");
 644
 645         if (fchdir(newroot) < 0)
 646                 return log_error_errno(-1, errno, "Failed to re-enter new root");
 647
 648         return 0;
 649 }
 650
 651 static int chroot_enter(void)
 652 {
 653         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
 654                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
 655                 return -1;
 656         }
 657
 658         if (chroot(".") < 0) {
 659                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
 660                 return -1;
 661         }
 662
 663         if (chdir("/") < 0) {
 664                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
 665                 return -1;
 666         }
 667
 668         return 0;
 669 }
 670
 671 static int permute_and_enter(void)
 672 {
 673         struct statfs sb;
 674
 675         if (statfs("/", &sb) < 0) {
 676                 lxcfs_error("%s\n", "Could not stat / mountpoint.");
 677                 return -1;
 678         }
 679
 680         /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
 681          * likely report TMPFS_MAGIC. Hence, when it reports no we still check
 682          * /proc/1/mountinfo. */
 683         if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
 684                 return chroot_enter();
 685
 686         if (pivot_enter() < 0) {
 687                 lxcfs_error("%s\n", "Could not perform pivot root.");
 688                 return -1;
 689         }
 690
 691         return 0;
 692 }
 693
 694 /* Prepare our new clean root. */
 695 static int permute_prepare(void)
 696 {
 697         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
 698                 lxcfs_error("%s\n", "Failed to create directory for new root.");
 699                 return -1;
 700         }
 701
 702         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
 703                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
 704                 return -1;
 705         }
 706
 707         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
 708                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
 709                 return -1;
 710         }
 711
 712         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
 713                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
 714                 return -1;
 715         }
 716
 717         return 0;
 718 }
 719
 720 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
 721 static bool permute_root(void)
 722 {
 723         /* Prepare new root. */
 724         if (permute_prepare() < 0)
 725                 return false;
 726
 727         /* Pivot into new root. */
 728         if (permute_and_enter() < 0)
 729                 return false;
 730
 731         return true;
 732 }
 733
 734 static bool cgfs_prepare_mounts(void)
 735 {
 736         if (!mkdir_p(BASEDIR, 0700)) {
 737                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
 738                 return false;
 739         }
 740
 741         if (!umount_if_mounted()) {
 742                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
 743                 return false;
 744         }
 745
 746         if (unshare(CLONE_NEWNS) < 0) {
 747                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
 748                 return false;
 749         }
 750
 751         cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
 752         if (cgroup_ops->mntns_fd < 0) {
 753                 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
 754                 return false;
 755         }
 756
 757         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
 758                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
 759                 return false;
 760         }
 761
 762         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
 763                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
 764                 return false;
 765         }
 766
 767         return true;
 768 }
 769
 770 static bool cgfs_mount_hierarchies(void)
 771 {
 772         if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
 773                 return false;
 774
 775         if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
 776                 return false;
 777
 778         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
 779                 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
 780                 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
 781                 if ((*h)->fd < 0)
 782                         return false;
 783         }
 784
 785         return true;
 786 }
 787
 788 static bool cgfs_setup_controllers(void)
 789 {
 790         if (!cgfs_prepare_mounts())
 791                 return false;
 792
 793         if (!cgfs_mount_hierarchies())
 794                 return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");
 795
 796         if (!permute_root())
 797                 return false;
 798
 799         return true;
 800 }
 801
 802 static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra)
 803 {
 804         int ret;
 805
 806         if (reload_successful) {
 807                 reload_successful = 0;
 808
 809                 /* write() is async signal safe */
 810                 ret = write(STDERR_FILENO,
 811                             "Switched into non-virtualization mode\n",
 812                             STRLITERALLEN("Switched into non-virtualization mode\n"));
 813                 if (ret < 0)
 814                         goto please_compiler;
 815         } else {
 816                 reload_successful = 1;
 817
 818                 /* write() is async signal safe */
 819                 ret = write(STDERR_FILENO, "Switched into virtualization mode\n",
 820                             STRLITERALLEN("Switched into virtualization mode\n"));
 821                 if (ret < 0)
 822                         goto please_compiler;
 823         }
 824
 825 please_compiler:
 826         /*
 827          * The write() syscall is a function whose return value needs to be
 828          * checked. Otherwise the compiler will warn.Another one could be to
 829          * use syscall(__NR_write, ...) directly but whatever.
 830          */
 831         return;
 832 }
 833
/*
 * Library constructor: initializes cgroup support, builds the private cgroup
 * mounts in a separate mount namespace, switches back to the initial mount
 * namespace, initializes the CPU view, probes optional kernel features and
 * installs the SIGUSR2 handler.
 *
 * reload_successful is set to 1 only when the end of the function is reached;
 * every jump to broken_upgrade leaves it at 0.
 */
static void __attribute__((constructor)) lxcfs_init(void)
{
        __do_close int init_ns = -EBADF, root_fd = -EBADF,
                                  pidfd = -EBADF;
        int i = 0;
        pid_t pid;

        lxcfs_info("Running constructor %s to reload liblxcfs", __func__);

        cgroup_ops = cgroup_init();
        if (!cgroup_ops) {
                lxcfs_info("Failed to initialize cgroup support");
                goto broken_upgrade;
        }

        /* Preserve initial namespace. */
        pid = getpid();
        init_ns = preserve_ns(pid, "mnt");
        if (init_ns < 0) {
                lxcfs_info("Failed to preserve initial mount namespace");
                goto broken_upgrade;
        }

        /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
         * to privately mount lxcfs cgroups. */
        /* NOTE(review): if log_exit() terminates the process, the gotos that
         * follow the log_exit() calls below are never reached - confirm. */
        if (!cgfs_setup_controllers()) {
                log_exit("Failed to setup private cgroup mounts for lxcfs");
                goto broken_upgrade;
        }

        /* Return to the mount namespace preserved above. */
        if (setns(init_ns, 0) < 0) {
                log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
                goto broken_upgrade;
        }

        if (!init_cpuview()) {
                log_exit("Failed to init CPU view");
                goto broken_upgrade;
        }

        lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
        lxcfs_info("hierarchies:");

        /* Log one line per hierarchy: index, directory fd, controller list. */
        for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
                char **controller_list = (*h)->controllers;
                __do_free char *controllers = NULL;
                if (controller_list && *controller_list)
                        controllers = lxc_string_join(",", (const char **)controller_list, false);
                lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
        }

        /* Probe pidfd support by opening a pidfd for ourselves and sending signal 0. */
        pidfd = pidfd_open(pid, 0);
        if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
                can_use_pidfd = true;
                lxcfs_info("Kernel supports pidfds");
        }

        can_use_swap = cgroup_ops->can_use_swap(cgroup_ops);
        if (can_use_swap)
                lxcfs_info("Kernel supports swap accounting");
        else
                lxcfs_info("Kernel does not support swap accounting");

        lxcfs_info("api_extensions:");
        for (size_t nr = 0; nr < nr_api_extensions; nr++)
                lxcfs_info("- %s", api_extensions[nr]);

        /* Failing to move to "/" is only logged; initialization continues. */
        root_fd = open("/", O_PATH | O_CLOEXEC);
        if (root_fd < 0)
                lxcfs_info("%s - Failed to open root directory", strerror(errno));
        else if (fchdir(root_fd) < 0)
                lxcfs_info("%s - Failed to change to root directory", strerror(errno));

        if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
                lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
                goto broken_upgrade;
        }

        reload_successful = 1;
        return;

broken_upgrade:
        reload_successful = 0;
        lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
}
 919
/*
 * Library destructor: empties the init pid cache, releases the CPU view and
 * hands cgroup_ops to cgroup_exit().
 */
static void __attribute__((destructor)) lxcfs_exit(void)
{
        lxcfs_info("Running destructor %s", __func__);

        clear_initpid_store();
        free_cpuview();
        cgroup_exit(cgroup_ops);
}
 928
 929 void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data)
 930 {
 931         struct fuse_context *fc = fuse_get_context();
 932         can_use_sys_cpu = true;
 933         has_versioned_opts = true;
 934         return fc->private_data;
 935 }