src/bindings.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include "config.h"
   4
   5 #include <dirent.h>
   6 #include <errno.h>
   7 #include <fcntl.h>
   8 #include <inttypes.h>
   9 #include <libgen.h>
  10 #include <linux/magic.h>
  11 #include <linux/sched.h>
  12 #include <pthread.h>
  13 #include <sched.h>
  14 #include <stdarg.h>
  15 #include <stdbool.h>
  16 #include <stdint.h>
  17 #include <stdio.h>
  18 #include <stdlib.h>
  19 #include <string.h>
  20 #include <sys/epoll.h>
  21 #include <sys/mman.h>
  22 #include <sys/mount.h>
  23 #include <sys/param.h>
  24 #include <sys/socket.h>
  25 #include <sys/syscall.h>
  26 #include <sys/sysinfo.h>
  27 #include <sys/vfs.h>
  28 #include <time.h>
  29 #include <unistd.h>
  30 #include <wait.h>
  31
  32 #include "bindings.h"
  33
  34 #include "api_extensions.h"
  35 #include "cgroup_fuse.h"
  36 #include "cgroups/cgroup.h"
  37 #include "cgroups/cgroup_utils.h"
  38 #include "memory_utils.h"
  39 #include "proc_cpuview.h"
  40 #include "syscall_numbers.h"
  41 #include "utils.h"
  42
/* Set by lxcfs_init() once pidfd_open() and pidfd_send_signal() both succeed. */
static bool can_use_pidfd;
/* Result of cgroup_ops->can_use_swap(), recorded by lxcfs_init(). */
static bool can_use_swap;
/* Set by lxcfs_fuse_init() when built with HAVE_FUSE_RETURNS_DT_TYPE. */
static bool can_use_sys_cpu;
/* Set unconditionally by lxcfs_fuse_init(). */
static bool has_versioned_opts;
/* True when the "memory" controller lives on the unified hierarchy. */
static bool memory_is_cgroupv2;

/*
 * Non-zero after the constructor completed successfully. Also flipped by the
 * SIGUSR2 handler, hence the signal-safe type.
 */
static volatile sig_atomic_t reload_successful;
  50
  51 bool liblxcfs_functional(void)
  52 {
  53         return reload_successful != 0;
  54 }
  55
  56 bool liblxcfs_can_use_swap(void)
  57 {
  58         return can_use_swap;
  59 }
  60
  61 bool liblxcfs_can_use_sys_cpu(void)
  62 {
  63         return can_use_sys_cpu;
  64 }
  65
  66 bool liblxcfs_has_versioned_opts(void)
  67 {
  68         return has_versioned_opts;
  69 }
  70
  71 bool liblxcfs_memory_is_cgroupv2(void)
  72 {
  73         return memory_is_cgroupv2;
  74 }
  75
  76 /* Define pivot_root() if missing from the C library */
  77 #ifndef HAVE_PIVOT_ROOT
  78 static int pivot_root(const char *new_root, const char *put_old)
  79 {
  80         return syscall(__NR_pivot_root, new_root, put_old);
  81 }
  82 #else
  83 extern int pivot_root(const char *new_root, const char *put_old);
  84 #endif
  85
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *       ucred.pid = 1, and read the initpid.  Cache
 *       initpid and creation time for /proc/initpid
 *       in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *       what we have saved.  If not, clear the store
 *       entry and go back to a.  If so, return the
 *       cached initpid.
 */
struct pidns_init_store {
        ino_t ino;     /* inode number for /proc/$pid/ns/pid */
        pid_t initpid; /* the pid of init in that ns */
        int init_pidfd; /* pidfd for initpid; negative when none was opened */
        int64_t ctime; /* st_ctime of /proc/$initpid when the entry was saved */
        struct pidns_init_store *next; /* next entry in the same hash bucket */
        int64_t lastcheck; /* time(NULL) at creation or last successful lookup */
};
 108
/* lol - look at how they are allocated in the kernel */
/* Number of buckets in the init pid cache. */
#define PIDNS_HASH_SIZE 4096
/* Map a pid namespace inode number to its bucket index. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Buckets of singly linked pidns_init_store entries, indexed by HASH(ino). */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Protects pidns_hash_table and every entry reachable from it. */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 115
 116 static void mutex_lock(pthread_mutex_t *l)
 117 {
 118         int ret;
 119
 120         ret = pthread_mutex_lock(l);
 121         if (ret)
 122                 log_exit("%s - returned %d\n", strerror(ret), ret);
 123 }
 124
/* Global cgroup driver handle; assigned from cgroup_init() in the constructor. */
struct cgroup_ops *cgroup_ops;
 126
 127 static void mutex_unlock(pthread_mutex_t *l)
 128 {
 129         int ret;
 130
 131         ret = pthread_mutex_unlock(l);
 132         if (ret)
 133                 log_exit("%s - returned %d\n", strerror(ret), ret);
 134 }
 135
 136 static inline void store_lock(void)
 137 {
 138         mutex_lock(&pidns_store_mutex);
 139 }
 140
 141 static inline void store_unlock(void)
 142 {
 143         mutex_unlock(&pidns_store_mutex);
 144 }
 145
 146 /* /proc/       =    6
 147  *                +
 148  * <pid-as-str> =   INTTYPE_TO_STRLEN(pid_t)
 149  *                +
 150  * \0           =    1
 151  */
 152 #define LXCFS_PROC_PID_LEN \
 153         (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + +1)
 154
 155 static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
 156 {
 157         int ret;
 158
 159         if (entry->init_pidfd < 0)
 160                 return ret_errno(ENOSYS);
 161
 162         ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
 163         if (ret < 0) {
 164                 if (errno == ENOSYS)
 165                         return ret_errno(ENOSYS);
 166
 167                 return 0;
 168         }
 169
 170         return 1;
 171 }
 172
 173 static int initpid_still_valid_stat(struct pidns_init_store *entry)
 174 {
 175         struct stat st;
 176         char path[LXCFS_PROC_PID_LEN];
 177
 178         snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
 179         if (stat(path, &st) || entry->ctime != st.st_ctime)
 180                 return 0;
 181
 182         return 1;
 183 }
 184
 185 /* Must be called under store_lock */
 186 static bool initpid_still_valid(struct pidns_init_store *entry)
 187 {
 188         int ret;
 189
 190         ret = initpid_still_valid_pidfd(entry);
 191         if (ret < 0)
 192                 ret = initpid_still_valid_stat(entry);
 193
 194         return ret == 1;
 195 }
 196
 197 /* Must be called under store_lock */
 198 static void remove_initpid(struct pidns_init_store *entry)
 199 {
 200         struct pidns_init_store *it;
 201         int ino_hash;
 202
 203         lxcfs_debug("Removing cached entry for pid %d from init pid cache",
 204                     entry->initpid);
 205
 206         ino_hash = HASH(entry->ino);
 207         if (pidns_hash_table[ino_hash] == entry) {
 208                 pidns_hash_table[ino_hash] = entry->next;
 209                 close_prot_errno_disarm(entry->init_pidfd);
 210                 free_disarm(entry);
 211                 return;
 212         }
 213
 214         it = pidns_hash_table[ino_hash];
 215         while (it) {
 216                 if (it->next == entry) {
 217                         it->next = entry->next;
 218                         close_prot_errno_disarm(entry->init_pidfd);
 219                         free_disarm(entry);
 220                         return;
 221                 }
 222                 it = it->next;
 223         }
 224 }
 225
 226 #define PURGE_SECS 5
 227 /* Must be called under store_lock */
 228 static void prune_initpid_store(void)
 229 {
 230         static int64_t last_prune = 0;
 231         int64_t now, threshold;
 232
 233         if (!last_prune) {
 234                 last_prune = time(NULL);
 235                 return;
 236         }
 237
 238         now = time(NULL);
 239         if (now < (last_prune + PURGE_SECS))
 240                 return;
 241
 242         lxcfs_debug("Pruning init pid cache");
 243
 244         last_prune = now;
 245         threshold = now - 2 * PURGE_SECS;
 246
 247         for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
 248                 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
 249                         if (entry->lastcheck < threshold) {
 250                                 struct pidns_init_store *cur = entry;
 251
 252                                 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
 253
 254                                 if (prev)
 255                                         prev->next = entry->next;
 256                                 else
 257                                         pidns_hash_table[i] = entry->next;
 258                                 entry = entry->next;
 259                                 close_prot_errno_disarm(cur->init_pidfd);
 260                                 free_disarm(cur);
 261                         } else {
 262                                 prev = entry;
 263                                 entry = entry->next;
 264                         }
 265                 }
 266         }
 267 }
 268
 269 static void clear_initpid_store(void)
 270 {
 271         store_lock();
 272         for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
 273                 for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) {
 274                         struct pidns_init_store *cur = entry;
 275
 276                         lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
 277
 278                         pidns_hash_table[i] = entry->next;
 279                         entry = entry->next;
 280                         close_prot_errno_disarm(cur->init_pidfd);
 281                         free_disarm(cur);
 282                 }
 283         }
 284         store_unlock();
 285 }
 286
 287 /* Must be called under store_lock */
 288 static void save_initpid(ino_t pidns_inode, pid_t pid)
 289 {
 290         __do_free struct pidns_init_store *entry = NULL;
 291         __do_close int pidfd = -EBADF;
 292         const struct lxcfs_opts *opts = fuse_get_context()->private_data;
 293         char path[LXCFS_PROC_PID_LEN];
 294         struct stat st;
 295         int ino_hash;
 296
 297         if (opts && opts->use_pidfd && can_use_pidfd) {
 298                 pidfd = pidfd_open(pid, 0);
 299                 if (pidfd < 0)
 300                         return;
 301         }
 302
 303         snprintf(path, sizeof(path), "/proc/%d", pid);
 304         if (stat(path, &st))
 305                 return;
 306
 307         entry = zalloc(sizeof(*entry));
 308         if (!entry)
 309                 return;
 310
 311         ino_hash = HASH(pidns_inode);
 312         *entry = (struct pidns_init_store){
 313                 .ino            = pidns_inode,
 314                 .initpid        = pid,
 315                 .ctime          = st.st_ctime,
 316                 .next           = pidns_hash_table[ino_hash],
 317                 .lastcheck      = time(NULL),
 318                 .init_pidfd     = move_fd(pidfd),
 319         };
 320         pidns_hash_table[ino_hash] = move_ptr(entry);
 321
 322         lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
 323 }
 324
 325 /*
 326  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 327  * entry for the inode number and creation time.  Verify that the init pid
 328  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 329  * otherwise.
 330  * Must be called under store_lock
 331  */
 332 static pid_t lookup_verify_initpid(ino_t pidns_inode)
 333 {
 334         struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
 335
 336         while (entry) {
 337                 if (entry->ino == pidns_inode) {
 338                         if (initpid_still_valid(entry)) {
 339                                 entry->lastcheck = time(NULL);
 340                                 return entry->initpid;
 341                         }
 342
 343                         remove_initpid(entry);
 344                         return ret_errno(ESRCH);
 345                 }
 346                 entry = entry->next;
 347         }
 348
 349         return ret_errno(ESRCH);
 350 }
 351
 352 static bool send_creds_ok(int sock_fd)
 353 {
 354         char v = '1'; /* we are the child */
 355         struct ucred cred = {
 356             .uid = 0,
 357             .gid = 0,
 358             .pid = 1,
 359         };
 360
 361         return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK;
 362 }
 363
/*
 * Thin wrapper around the raw clone system call that behaves like fork():
 * no new stack is supplied and SIGCHLD is always OR'ed into @flags.
 *
 * @flags: clone flags; CLONE_VM, CLONE_PARENT_SETTID, CLONE_CHILD_SETTID,
 *         CLONE_CHILD_CLEARTID and CLONE_SETTLS are rejected.
 * @pidfd: forwarded to the kernel as a trailing clone argument (presumably
 *         meant for use with CLONE_PIDFD - confirm against callers); may be
 *         NULL.
 *
 * Returns -EINVAL when a rejected flag is set. Otherwise returns what the
 * system call returns: 0 in the child, the child's pid in the parent. On
 * sparc64 a failure is turned into -1 with errno set from the trap's result.
 *
 * Note that errno is set to EINVAL up front and is not cleared on the
 * success path.
 */
__returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
{
        /*
         * These flags are of no interest to us, so we don't jump through any
         * hoops of retrieving them and passing them to the kernel.
         */
        errno = EINVAL;
        if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
                      CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
                return -EINVAL;

#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
        /* On s390/s390x and cris the order of the first and second arguments
         * of the system call is reversed.
         */
        return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
#elif defined(__sparc__) && defined(__arch64__)
        {
                /*
                 * sparc64 always returns the other process id in %o0, and a
                 * boolean flag whether this is the child or the parent in %o1.
                 * Inline assembly is needed to get the flag returned in %o1.
                 */
                register long g1 asm("g1") = __NR_clone;
                register long o0 asm("o0") = flags | SIGCHLD;
                register long o1 asm("o1") = 0; /* is parent/child indicator */
                register long o2 asm("o2") = (unsigned long)pidfd;
                long is_error, retval, in_child;
                pid_t child_pid;

                asm volatile(
#if defined(__arch64__)
                    "t 0x6d\n\t" /* 64-bit trap */
#else
                    "t 0x10\n\t" /* 32-bit trap */
#endif
                    /*
                     * catch errors: On sparc, the carry bit (csr) in the
                     * processor status register (psr) is used instead of a
                     * full register.
                     */
                    "addx %%g0, 0, %%g1"
                    : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
                    : "r"(g1), "r"(o0), "r"(o1), "r"(o2)     /* inputs */
                    : "%cc");                                /* clobbers */

                is_error = g1;
                retval = o0;
                in_child = o1;

                /* On failure %o0 carries the error number. */
                if (is_error) {
                        errno = retval;
                        return -1;
                }

                if (in_child)
                        return 0;

                child_pid = retval;
                return child_pid;
        }
#elif defined(__ia64__)
        /* On ia64 the stack and stack size are passed as separate arguments. */
        return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd);
#else
        return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
#endif
}
 432
/*
 * Buffer size needed for "/proc/<pid>/ns/pid" including the trailing \0,
 * with the pid sized as a uint64_t.
 */
#define LXCFS_PROC_PID_NS_LEN                                    \
        (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
         STRLITERALLEN("/ns/pid") + 1)
 436
 437 /*
 438  * clone a task which switches to @task's namespace and writes '1'.
 439  * over a unix sock so we can read the task's reaper's pid in our
 440  * namespace
 441  *
 442  * Note: glibc's fork() does not respect pidns, which can lead to failed
 443  * assertions inside glibc (and thus failed forks) if the child's pid in
 444  * the pidns and the parent pid outside are identical. Using clone prevents
 445  * this issue.
 446  */
 447 static void write_task_init_pid_exit(int sock, pid_t target)
 448 {
 449         __do_close int fd = -EBADF;
 450         char path[LXCFS_PROC_PID_NS_LEN];
 451         pid_t pid;
 452
 453         snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
 454         fd = open(path, O_RDONLY | O_CLOEXEC);
 455         if (fd < 0)
 456                 log_exit("write_task_init_pid_exit open of ns/pid");
 457
 458         if (setns(fd, 0))
 459                 log_exit("Failed to setns to pid namespace of process %d", target);
 460
 461         pid = lxcfs_raw_clone(0, NULL);
 462         if (pid < 0)
 463                 _exit(EXIT_FAILURE);
 464
 465         if (pid == 0) {
 466                 if (!send_creds_ok(sock))
 467                         _exit(EXIT_FAILURE);
 468
 469                 _exit(EXIT_SUCCESS);
 470         }
 471
 472         if (!wait_for_pid(pid))
 473                 _exit(EXIT_FAILURE);
 474
 475         _exit(EXIT_SUCCESS);
 476 }
 477
 478 static pid_t scm_init_pid(pid_t task)
 479 {
 480         char v = '0';
 481         pid_t pid_ret = -1;
 482         struct ucred cred = {
 483                 .pid = -1,
 484                 .uid = -1,
 485                 .gid = -1,
 486         };
 487         pid_t pid;
 488         int sock[2];
 489
 490         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
 491                 return -1;
 492
 493         pid = fork();
 494         if (pid < 0)
 495                 goto out;
 496
 497         if (pid == 0) {
 498                 close(sock[1]);
 499                 write_task_init_pid_exit(sock[0], task);
 500                 _exit(EXIT_SUCCESS);
 501         }
 502
 503         if (!recv_creds(sock[1], &cred, &v))
 504                 goto out;
 505
 506         pid_ret = cred.pid;
 507
 508 out:
 509         close(sock[0]);
 510         close(sock[1]);
 511         if (pid > 0)
 512                 wait_for_pid(pid);
 513
 514         return pid_ret;
 515 }
 516
 517 pid_t lookup_initpid_in_store(pid_t pid)
 518 {
 519         pid_t hashed_pid = 0;
 520         char path[LXCFS_PROC_PID_NS_LEN];
 521         struct stat st;
 522
 523         snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
 524         if (stat(path, &st))
 525                 return ret_errno(ESRCH);
 526
 527         store_lock();
 528
 529         hashed_pid = lookup_verify_initpid(st.st_ino);
 530         if (hashed_pid < 0) {
 531                 /* release the mutex as the following call is expensive */
 532                 store_unlock();
 533
 534                 hashed_pid = scm_init_pid(pid);
 535
 536                 store_lock();
 537
 538                 if (hashed_pid > 0)
 539                         save_initpid(st.st_ino, hashed_pid);
 540         }
 541
 542         /*
 543          * Prune at the end in case we're pruning the value
 544          * we were about to return.
 545          */
 546         prune_initpid_store();
 547         store_unlock();
 548
 549         return hashed_pid;
 550 }
 551
 552 /*
 553  * Functions needed to setup cgroups in the __constructor__.
 554  */
 555
 556 static bool umount_if_mounted(void)
 557 {
 558         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
 559                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
 560                 return false;
 561         }
 562         return true;
 563 }
 564
 565 /* __typeof__ should be safe to use with all compilers. */
 566 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
 567 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
 568 {
 569         return (fs->f_type == (fs_type_magic)magic_val);
 570 }
 571
 572 /*
 573  * looking at fs/proc_namespace.c, it appears we can
 574  * actually expect the rootfs entry to very specifically contain
 575  * " - rootfs rootfs "
 576  * IIUC, so long as we've chrooted so that rootfs is not our root,
 577  * the rootfs entry should always be skipped in mountinfo contents.
 578  */
 579 static bool is_on_ramfs(void)
 580 {
 581         __do_free char *line = NULL;
 582         __do_free void *fopen_cache = NULL;
 583         __do_fclose FILE *f = NULL;
 584         size_t len = 0;
 585
 586         f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
 587         if (!f)
 588                 return false;
 589
 590         while (getline(&line, &len, f) != -1) {
 591                 int i;
 592                 char *p, *p2;
 593
 594                 for (p = line, i = 0; p && i < 4; i++)
 595                         p = strchr(p + 1, ' ');
 596                 if (!p)
 597                         continue;
 598
 599                 p2 = strchr(p + 1, ' ');
 600                 if (!p2)
 601                         continue;
 602                 *p2 = '\0';
 603                 if (strcmp(p + 1, "/") == 0) {
 604                         /* This is '/'. Is it the ramfs? */
 605                         p = strchr(p2 + 1, '-');
 606                         if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
 607                                 return true;
 608                 }
 609         }
 610
 611         return false;
 612 }
 613
 614 static int pivot_enter(void)
 615 {
 616         __do_close int oldroot = -EBADF, newroot = -EBADF;
 617
 618         oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
 619         if (oldroot < 0)
 620                 return log_error_errno(-1, errno,
 621                                        "Failed to open old root for fchdir");
 622
 623         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
 624         if (newroot < 0)
 625                 return log_error_errno(-1, errno,
 626                                        "Failed to open new root for fchdir");
 627
 628         /* change into new root fs */
 629         if (fchdir(newroot) < 0)
 630                 return log_error_errno(-1,
 631                                        errno, "Failed to change directory to new rootfs: %s",
 632                                        ROOTDIR);
 633
 634         /* pivot_root into our new root fs */
 635         if (pivot_root(".", ".") < 0)
 636                 return log_error_errno(-1, errno,
 637                                        "pivot_root() syscall failed: %s",
 638                                        strerror(errno));
 639
 640         /*
 641          * At this point the old-root is mounted on top of our new-root.
 642          * To unmounted it we must not be chdir'd into it, so escape back
 643          * to the old-root.
 644          */
 645         if (fchdir(oldroot) < 0)
 646                 return log_error_errno(-1, errno, "Failed to enter old root");
 647
 648         if (umount2(".", MNT_DETACH) < 0)
 649                 return log_error_errno(-1, errno, "Failed to detach old root");
 650
 651         if (fchdir(newroot) < 0)
 652                 return log_error_errno(-1, errno, "Failed to re-enter new root");
 653
 654         return 0;
 655 }
 656
 657 static int chroot_enter(void)
 658 {
 659         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
 660                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
 661                 return -1;
 662         }
 663
 664         if (chroot(".") < 0) {
 665                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
 666                 return -1;
 667         }
 668
 669         if (chdir("/") < 0) {
 670                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
 671                 return -1;
 672         }
 673
 674         return 0;
 675 }
 676
 677 static int permute_and_enter(void)
 678 {
 679         struct statfs sb;
 680
 681         if (statfs("/", &sb) < 0) {
 682                 lxcfs_error("%s\n", "Could not stat / mountpoint.");
 683                 return -1;
 684         }
 685
 686         /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
 687          * likely report TMPFS_MAGIC. Hence, when it reports no we still check
 688          * /proc/1/mountinfo. */
 689         if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
 690                 return chroot_enter();
 691
 692         if (pivot_enter() < 0) {
 693                 lxcfs_error("%s\n", "Could not perform pivot root.");
 694                 return -1;
 695         }
 696
 697         return 0;
 698 }
 699
 700 /* Prepare our new clean root. */
 701 static int permute_prepare(void)
 702 {
 703         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
 704                 lxcfs_error("%s\n", "Failed to create directory for new root.");
 705                 return -1;
 706         }
 707
 708         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
 709                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
 710                 return -1;
 711         }
 712
 713         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
 714                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
 715                 return -1;
 716         }
 717
 718         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
 719                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
 720                 return -1;
 721         }
 722
 723         return 0;
 724 }
 725
 726 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
 727 static bool permute_root(void)
 728 {
 729         /* Prepare new root. */
 730         if (permute_prepare() < 0)
 731                 return false;
 732
 733         /* Pivot into new root. */
 734         if (permute_and_enter() < 0)
 735                 return false;
 736
 737         return true;
 738 }
 739
 740 static bool cgfs_prepare_mounts(void)
 741 {
 742         if (!mkdir_p(BASEDIR, 0700)) {
 743                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
 744                 return false;
 745         }
 746
 747         if (!umount_if_mounted()) {
 748                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
 749                 return false;
 750         }
 751
 752         if (unshare(CLONE_NEWNS) < 0) {
 753                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
 754                 return false;
 755         }
 756
 757         cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
 758         if (cgroup_ops->mntns_fd < 0) {
 759                 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
 760                 return false;
 761         }
 762
 763         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
 764                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
 765                 return false;
 766         }
 767
 768         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
 769                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
 770                 return false;
 771         }
 772
 773         return true;
 774 }
 775
 776 static bool cgfs_mount_hierarchies(void)
 777 {
 778         if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
 779                 return false;
 780
 781         if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
 782                 return false;
 783
 784         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
 785                 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
 786                 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
 787                 if ((*h)->fd < 0)
 788                         return false;
 789         }
 790
 791         return true;
 792 }
 793
 794 static bool cgfs_setup_controllers(void)
 795 {
 796         if (!cgfs_prepare_mounts())
 797                 return false;
 798
 799         if (!cgfs_mount_hierarchies())
 800                 return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");
 801
 802         if (!permute_root())
 803                 return false;
 804
 805         return true;
 806 }
 807
 808 static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra)
 809 {
 810         int ret;
 811
 812         if (reload_successful) {
 813                 reload_successful = 0;
 814
 815                 /* write() is async signal safe */
 816                 ret = write(STDERR_FILENO,
 817                             "Switched into non-virtualization mode\n",
 818                             STRLITERALLEN("Switched into non-virtualization mode\n"));
 819                 if (ret < 0)
 820                         goto please_compiler;
 821         } else {
 822                 reload_successful = 1;
 823
 824                 /* write() is async signal safe */
 825                 ret = write(STDERR_FILENO, "Switched into virtualization mode\n",
 826                             STRLITERALLEN("Switched into virtualization mode\n"));
 827                 if (ret < 0)
 828                         goto please_compiler;
 829         }
 830
 831 please_compiler:
 832         /*
 833          * The write() syscall is a function whose return value needs to be
 834          * checked. Otherwise the compiler will warn.Another one could be to
 835          * use syscall(__NR_write, ...) directly but whatever.
 836          */
 837         return;
 838 }
 839
 840 static void __attribute__((constructor)) lxcfs_init(void)
 841 {
 842         __do_close int init_ns = -EBADF, root_fd = -EBADF,
 843                                   pidfd = -EBADF;
 844         int i = 0;
 845         pid_t pid;
 846         struct hierarchy *hierarchy;
 847
 848         lxcfs_info("Running constructor %s to reload liblxcfs", __func__);
 849
 850         cgroup_ops = cgroup_init();
 851         if (!cgroup_ops) {
 852                 lxcfs_info("Failed to initialize cgroup support");
 853                 goto broken_upgrade;
 854         }
 855
 856         /* Preserve initial namespace. */
 857         pid = getpid();
 858         init_ns = preserve_ns(pid, "mnt");
 859         if (init_ns < 0) {
 860                 lxcfs_info("Failed to preserve initial mount namespace");
 861                 goto broken_upgrade;
 862         }
 863
 864         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
 865          * to privately mount lxcfs cgroups. */
 866         if (!cgfs_setup_controllers()) {
 867                 log_exit("Failed to setup private cgroup mounts for lxcfs");
 868                 goto broken_upgrade;
 869         }
 870
 871         if (setns(init_ns, 0) < 0) {
 872                 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
 873                 goto broken_upgrade;
 874         }
 875
 876         if (!init_cpuview()) {
 877                 log_exit("Failed to init CPU view");
 878                 goto broken_upgrade;
 879         }
 880
 881         lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
 882         lxcfs_info("hierarchies:");
 883
 884         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
 885                 char **controller_list = (*h)->controllers;
 886                 __do_free char *controllers = NULL;
 887                 if (controller_list && *controller_list)
 888                         controllers = lxc_string_join(",", (const char **)controller_list, false);
 889                 lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
 890         }
 891
 892         pidfd = pidfd_open(pid, 0);
 893         if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
 894                 can_use_pidfd = true;
 895                 lxcfs_info("Kernel supports pidfds");
 896         }
 897
 898         can_use_swap = cgroup_ops->can_use_swap(cgroup_ops);
 899         if (can_use_swap)
 900                 lxcfs_info("Kernel supports swap accounting");
 901         else
 902                 lxcfs_info("Kernel does not support swap accounting");
 903
 904         hierarchy = cgroup_ops->get_hierarchy(cgroup_ops, "memory");
 905         memory_is_cgroupv2 = hierarchy && is_unified_hierarchy(hierarchy);
 906
 907         lxcfs_info("api_extensions:");
 908         for (size_t nr = 0; nr < nr_api_extensions; nr++)
 909                 lxcfs_info("- %s", api_extensions[nr]);
 910
 911         root_fd = open("/", O_PATH | O_CLOEXEC);
 912         if (root_fd < 0)
 913                 lxcfs_info("%s - Failed to open root directory", strerror(errno));
 914         else if (fchdir(root_fd) < 0)
 915                 lxcfs_info("%s - Failed to change to root directory", strerror(errno));
 916
 917         if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
 918                 lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
 919                 goto broken_upgrade;
 920         }
 921
 922         reload_successful = 1;
 923         return;
 924
 925 broken_upgrade:
 926         reload_successful = 0;
 927         lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
 928 }
 929
 930 static void __attribute__((destructor)) lxcfs_exit(void)
 931 {
 932         lxcfs_info("Running destructor %s", __func__);
 933
 934         clear_initpid_store();
 935         free_cpuview();
 936         cgroup_exit(cgroup_ops);
 937 }
 938
 939 void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data)
 940 {
 941         struct fuse_context *fc = fuse_get_context();
 942 #if HAVE_FUSE_RETURNS_DT_TYPE
 943         can_use_sys_cpu = true;
 944 #endif
 945         has_versioned_opts = true;
 946         return fc ? fc->private_data : NULL;
 947 }