1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include "config.h"
4
5 #include <dirent.h>
6 #include <errno.h>
7 #include <fcntl.h>
8 #include <inttypes.h>
9 #include <libgen.h>
10 #include <linux/magic.h>
11 #include <linux/sched.h>
12 #include <pthread.h>
13 #include <sched.h>
14 #include <stdarg.h>
15 #include <stdbool.h>
16 #include <stdint.h>
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <sys/epoll.h>
21 #include <sys/mman.h>
22 #include <sys/mount.h>
23 #include <sys/param.h>
24 #include <sys/socket.h>
25 #include <sys/syscall.h>
26 #include <sys/sysinfo.h>
27 #include <sys/vfs.h>
28 #include <time.h>
29 #include <unistd.h>
30 #include <wait.h>
31
32 #include "bindings.h"
33
34 #include "api_extensions.h"
35 #include "cgroup_fuse.h"
36 #include "cgroups/cgroup.h"
37 #include "cgroups/cgroup_utils.h"
38 #include "memory_utils.h"
39 #include "proc_cpuview.h"
40 #include "syscall_numbers.h"
41 #include "utils.h"
42
43 static bool can_use_pidfd;
44 static bool can_use_swap;
45 static bool can_use_sys_cpu;
46 static bool has_versioned_opts;
47 static bool memory_is_cgroupv2;
48 static __u32 host_personality;
49
50 static volatile sig_atomic_t reload_successful;
51
52 bool liblxcfs_functional(void)
53 {
54 return reload_successful != 0;
55 }
56
57 bool liblxcfs_can_use_swap(void)
58 {
59 return can_use_swap;
60 }
61
62 bool liblxcfs_can_use_sys_cpu(void)
63 {
64 return can_use_sys_cpu;
65 }
66
67 bool liblxcfs_has_versioned_opts(void)
68 {
69 return has_versioned_opts;
70 }
71
72 bool liblxcfs_memory_is_cgroupv2(void)
73 {
74 return memory_is_cgroupv2;
75 }
76
77 __u32 liblxcfs_personality(void)
78 {
79 return host_personality;
80 }
81
82 /* Define pivot_root() if missing from the C library */
83 #ifndef HAVE_PIVOT_ROOT
84 static int pivot_root(const char *new_root, const char *put_old)
85 {
86 return syscall(__NR_pivot_root, new_root, put_old);
87 }
88 #else
89 extern int pivot_root(const char *new_root, const char *put_old);
90 #endif
91
92 /*
93 * A table caching which pid is init for a pid namespace.
94 * When looking up which pid is init for $qpid, we first
95 * 1. Stat /proc/$qpid/ns/pid.
96 * 2. Check whether the ino_t is in our store.
97 * a. if not, fork a child in qpid's ns to send us
98 * ucred.pid = 1, and read the initpid. Cache
99 * initpid and creation time for /proc/initpid
100 * in a new store entry.
101 * b. if so, verify that /proc/initpid still matches
102 * what we have saved. If not, clear the store
103 * entry and go back to a. If so, return the
104 * cached initpid.
105 */
106 struct pidns_init_store {
107 ino_t ino; /* inode number for /proc/$pid/ns/pid */
108 	pid_t initpid; /* the pid of init in that ns */
109 int init_pidfd;
110 int64_t ctime; /* the time at which /proc/$initpid was created */
111 struct pidns_init_store *next;
112 int64_t lastcheck;
113 };
114
115 /* Cf. how the pid hash table is allocated in the kernel. */
116 #define PIDNS_HASH_SIZE 4096
117 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
118
119 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
120 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
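/*
 * Bucket selection is simply the pid namespace inode modulo the table size.
 * As an illustration (the inode value is made up): a namespace whose
 * /proc/$pid/ns/pid inode is 4026532397 lands in bucket
 * HASH(4026532397) = 4026532397 % 4096 = 557, and new entries are pushed
 * onto the front of that bucket's singly linked list.
 */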
121
122 static void mutex_lock(pthread_mutex_t *l)
123 {
124 int ret;
125
126 ret = pthread_mutex_lock(l);
127 if (ret)
128 log_exit("%s - returned %d\n", strerror(ret), ret);
129 }
130
131 struct cgroup_ops *cgroup_ops;
132
133 static void mutex_unlock(pthread_mutex_t *l)
134 {
135 int ret;
136
137 ret = pthread_mutex_unlock(l);
138 if (ret)
139 log_exit("%s - returned %d\n", strerror(ret), ret);
140 }
141
142 static inline void store_lock(void)
143 {
144 mutex_lock(&pidns_store_mutex);
145 }
146
147 static inline void store_unlock(void)
148 {
149 mutex_unlock(&pidns_store_mutex);
150 }
151
152 #define define_interruptible_lock(type, lockname, lockfn) \
153 int lockname##_interruptible(type *l) \
154 { \
155 int ret = ETIMEDOUT; \
156 while (!fuse_interrupted() && (ret == ETIMEDOUT)) { \
157 struct timespec deadline; \
158 clock_gettime(CLOCK_REALTIME, &deadline); \
159 deadline.tv_sec += 1; \
160 ret = lockfn(l, &deadline); \
161 } \
162 return -ret; \
163 }
164
165 define_interruptible_lock(pthread_mutex_t, mutex_lock, pthread_mutex_timedlock)
166 define_interruptible_lock(pthread_rwlock_t, rwlock_rdlock, pthread_rwlock_timedrdlock)
167 define_interruptible_lock(pthread_rwlock_t, rwlock_wrlock, pthread_rwlock_timedwrlock)
168
169 #undef define_interruptible_lock
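/*
 * For reference, mutex_lock_interruptible() above expands to roughly the
 * following (sketch of the macro expansion, not additional code):
 *
 *	int mutex_lock_interruptible(pthread_mutex_t *l)
 *	{
 *		int ret = ETIMEDOUT;
 *		while (!fuse_interrupted() && (ret == ETIMEDOUT)) {
 *			struct timespec deadline;
 *			clock_gettime(CLOCK_REALTIME, &deadline);
 *			deadline.tv_sec += 1;
 *			ret = pthread_mutex_timedlock(l, &deadline);
 *		}
 *		return -ret;
 *	}
 *
 * i.e. the lock is retried in one-second slices so that a FUSE interrupt is
 * noticed within a second instead of blocking indefinitely.
 */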
170
171 /* /proc/ = 6
172 * +
173 * <pid-as-str> = INTTYPE_TO_STRLEN(pid_t)
174 * +
175 * \0 = 1
176 */
177 #define LXCFS_PROC_PID_LEN \
178 	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
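/*
 * Worked example (the pid value is only illustrative): for the largest
 * default 64-bit pid, 4194304, the buffer must hold "/proc/4194304\0",
 * i.e. 6 + 7 + 1 = 14 bytes; sizing the pid part for a uint64_t keeps the
 * bound safe even if pid_max is raised.
 */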
179
180 static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
181 {
182 int ret;
183
184 if (entry->init_pidfd < 0)
185 return ret_errno(ENOSYS);
186
187 ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
188 if (ret < 0) {
189 if (errno == ENOSYS)
190 return ret_errno(ENOSYS);
191
192 return 0;
193 }
194
195 return 1;
196 }
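/*
 * Note: signal 0 makes pidfd_send_signal() perform only its existence and
 * permission checks without delivering anything, so the call above acts as
 * a cheap liveness probe that fails once the cached init has exited.
 */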
197
198 static int initpid_still_valid_stat(struct pidns_init_store *entry)
199 {
200 struct stat st;
201 char path[LXCFS_PROC_PID_LEN];
202
203 snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
204 if (stat(path, &st) || entry->ctime != st.st_ctime)
205 return 0;
206
207 return 1;
208 }
209
210 /* Must be called under store_lock */
211 static bool initpid_still_valid(struct pidns_init_store *entry)
212 {
213 int ret;
214
215 ret = initpid_still_valid_pidfd(entry);
216 if (ret < 0)
217 ret = initpid_still_valid_stat(entry);
218
219 return ret == 1;
220 }
221
222 /* Must be called under store_lock */
223 static void remove_initpid(struct pidns_init_store *entry)
224 {
225 struct pidns_init_store *it;
226 int ino_hash;
227
228 lxcfs_debug("Removing cached entry for pid %d from init pid cache",
229 entry->initpid);
230
231 ino_hash = HASH(entry->ino);
232 if (pidns_hash_table[ino_hash] == entry) {
233 pidns_hash_table[ino_hash] = entry->next;
234 close_prot_errno_disarm(entry->init_pidfd);
235 free_disarm(entry);
236 return;
237 }
238
239 it = pidns_hash_table[ino_hash];
240 while (it) {
241 if (it->next == entry) {
242 it->next = entry->next;
243 close_prot_errno_disarm(entry->init_pidfd);
244 free_disarm(entry);
245 return;
246 }
247 it = it->next;
248 }
249 }
250
251 #define PURGE_SECS 5
252 /* Must be called under store_lock */
253 static void prune_initpid_store(void)
254 {
255 static int64_t last_prune = 0;
256 int64_t now, threshold;
257
258 if (!last_prune) {
259 last_prune = time(NULL);
260 return;
261 }
262
263 now = time(NULL);
264 if (now < (last_prune + PURGE_SECS))
265 return;
266
267 lxcfs_debug("Pruning init pid cache");
268
269 last_prune = now;
270 threshold = now - 2 * PURGE_SECS;
271
272 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
273 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
274 if (entry->lastcheck < threshold) {
275 struct pidns_init_store *cur = entry;
276
277 lxcfs_debug("Removed cache entry for pid %d from init pid cache", cur->initpid);
278
279 if (prev)
280 prev->next = entry->next;
281 else
282 pidns_hash_table[i] = entry->next;
283 entry = entry->next;
284 close_prot_errno_disarm(cur->init_pidfd);
285 free_disarm(cur);
286 } else {
287 prev = entry;
288 entry = entry->next;
289 }
290 }
291 }
292 }
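/*
 * Timing example: with PURGE_SECS == 5, an entry whose lastcheck is more
 * than 2 * PURGE_SECS == 10 seconds old is dropped on the next prune pass,
 * and prune passes themselves run at most once every PURGE_SECS seconds,
 * triggered from lookup_initpid_in_store().
 */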
293
294 static void clear_initpid_store(void)
295 {
296 store_lock();
297 for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
298 for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) {
299 struct pidns_init_store *cur = entry;
300
301 lxcfs_debug("Removed cache entry for pid %d from init pid cache", cur->initpid);
302
303 pidns_hash_table[i] = entry->next;
304 entry = entry->next;
305 close_prot_errno_disarm(cur->init_pidfd);
306 free_disarm(cur);
307 }
308 }
309 store_unlock();
310 }
311
312 /* Must be called under store_lock */
313 static void save_initpid(ino_t pidns_inode, pid_t pid)
314 {
315 __do_free struct pidns_init_store *entry = NULL;
316 __do_close int pidfd = -EBADF;
317 const struct lxcfs_opts *opts = fuse_get_context()->private_data;
318 char path[LXCFS_PROC_PID_LEN];
319 struct stat st;
320 int ino_hash;
321
322 if (opts && opts->use_pidfd && can_use_pidfd) {
323 pidfd = pidfd_open(pid, 0);
324 if (pidfd < 0)
325 return;
326 }
327
328 snprintf(path, sizeof(path), "/proc/%d", pid);
329 if (stat(path, &st))
330 return;
331
332 entry = zalloc(sizeof(*entry));
333 if (!entry)
334 return;
335
336 ino_hash = HASH(pidns_inode);
337 *entry = (struct pidns_init_store){
338 .ino = pidns_inode,
339 .initpid = pid,
340 .ctime = st.st_ctime,
341 .next = pidns_hash_table[ino_hash],
342 .lastcheck = time(NULL),
343 .init_pidfd = move_fd(pidfd),
344 };
345 pidns_hash_table[ino_hash] = move_ptr(entry);
346
347 lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
348 }
349
350 /*
351  * Given the stat(2) info for a nsfd pid inode, look up the pidns_init_store
352  * entry for that inode number. Verify that the cached init pid is still
353  * valid; if not, remove the entry. Return the cached init pid if valid,
354  * -ESRCH otherwise.
355 * Must be called under store_lock
356 */
357 static pid_t lookup_verify_initpid(ino_t pidns_inode)
358 {
359 struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
360
361 while (entry) {
362 if (entry->ino == pidns_inode) {
363 if (initpid_still_valid(entry)) {
364 entry->lastcheck = time(NULL);
365 return entry->initpid;
366 }
367
368 remove_initpid(entry);
369 return ret_errno(ESRCH);
370 }
371 entry = entry->next;
372 }
373
374 return ret_errno(ESRCH);
375 }
376
377 static bool send_creds_ok(int sock_fd)
378 {
379 char v = '1'; /* we are the child */
380 struct ucred cred = {
381 .uid = 0,
382 .gid = 0,
383 .pid = 1,
384 };
385
386 return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK;
387 }
388
389 __returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
390 {
391 /*
392 	 * These flags are of no interest to us, so we don't jump through any
393 	 * hoops to retrieve them and pass them to the kernel.
394 */
395 errno = EINVAL;
396 if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
397 CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
398 return -EINVAL;
399
400 #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
401 /* On s390/s390x and cris the order of the first and second arguments
402 * of the system call is reversed.
403 */
404 return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
405 #elif defined(__sparc__) && defined(__arch64__)
406 {
407 /*
408 * sparc64 always returns the other process id in %o0, and a
409 * boolean flag whether this is the child or the parent in %o1.
410 * Inline assembly is needed to get the flag returned in %o1.
411 */
412 register long g1 asm("g1") = __NR_clone;
413 register long o0 asm("o0") = flags | SIGCHLD;
414 register long o1 asm("o1") = 0; /* is parent/child indicator */
415 register long o2 asm("o2") = (unsigned long)pidfd;
416 long is_error, retval, in_child;
417 pid_t child_pid;
418
419 asm volatile(
420 #if defined(__arch64__)
421 "t 0x6d\n\t" /* 64-bit trap */
422 #else
423 "t 0x10\n\t" /* 32-bit trap */
424 #endif
425 /*
426 * catch errors: On sparc, the carry bit (csr) in the
427 * processor status register (psr) is used instead of a
428 * full register.
429 */
430 "addx %%g0, 0, %%g1"
431 : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
432 : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */
433 : "%cc"); /* clobbers */
434
435 is_error = g1;
436 retval = o0;
437 in_child = o1;
438
439 if (is_error) {
440 errno = retval;
441 return -1;
442 }
443
444 if (in_child)
445 return 0;
446
447 child_pid = retval;
448 return child_pid;
449 }
450 #elif defined(__ia64__)
451 /* On ia64 the stack and stack size are passed as separate arguments. */
452 return syscall(__NR_clone, flags | SIGCHLD, NULL, 0, pidfd);
453 #else
454 return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
455 #endif
456 }
457
458 #define LXCFS_PROC_PID_NS_LEN \
459 (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
460 STRLITERALLEN("/ns/pid") + 1)
461
462 /*
463  * Clone a task which switches to @task's pid namespace and writes '1'
464  * over a unix socket so we can read the task's reaper's pid in our
465  * namespace.
466 *
467 * Note: glibc's fork() does not respect pidns, which can lead to failed
468 * assertions inside glibc (and thus failed forks) if the child's pid in
469 * the pidns and the parent pid outside are identical. Using clone prevents
470 * this issue.
471 */
472 static void write_task_init_pid_exit(int sock, pid_t target)
473 {
474 __do_close int fd = -EBADF;
475 char path[LXCFS_PROC_PID_NS_LEN];
476 pid_t pid;
477
478 snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
479 fd = open(path, O_RDONLY | O_CLOEXEC);
480 if (fd < 0)
481 log_exit("write_task_init_pid_exit open of ns/pid");
482
483 if (setns(fd, 0))
484 log_exit("Failed to setns to pid namespace of process %d", target);
485
486 pid = lxcfs_raw_clone(0, NULL);
487 if (pid < 0)
488 _exit(EXIT_FAILURE);
489
490 if (pid == 0) {
491 if (!send_creds_ok(sock))
492 _exit(EXIT_FAILURE);
493
494 _exit(EXIT_SUCCESS);
495 }
496
497 if (!wait_for_pid(pid))
498 _exit(EXIT_FAILURE);
499
500 _exit(EXIT_SUCCESS);
501 }
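/*
 * How the init pid is recovered (illustrative): the clone()d child above is
 * a member of the target pid namespace and sends
 *
 *	struct ucred cred = { .pid = 1, .uid = 0, .gid = 0 };
 *
 * as SCM_CREDENTIALS over the socketpair (see send_creds_ok()). The kernel
 * translates cred.pid into the receiver's pid namespace, so recv_creds() in
 * scm_init_pid() observes that namespace's init pid as it appears from
 * lxcfs' own pid namespace.
 */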
502
503 static pid_t scm_init_pid(pid_t task)
504 {
505 char v = '0';
506 pid_t pid_ret = -1;
507 struct ucred cred = {
508 .pid = -1,
509 .uid = -1,
510 .gid = -1,
511 };
512 pid_t pid;
513 int sock[2];
514
515 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
516 return -1;
517
518 pid = fork();
519 if (pid < 0)
520 goto out;
521
522 if (pid == 0) {
523 close(sock[1]);
524 write_task_init_pid_exit(sock[0], task);
525 _exit(EXIT_SUCCESS);
526 }
527
528 if (!recv_creds(sock[1], &cred, &v))
529 goto out;
530
531 pid_ret = cred.pid;
532
533 out:
534 close(sock[0]);
535 close(sock[1]);
536 if (pid > 0)
537 wait_for_pid(pid);
538
539 return pid_ret;
540 }
541
542 pid_t lookup_initpid_in_store(pid_t pid)
543 {
544 pid_t hashed_pid = 0;
545 char path[LXCFS_PROC_PID_NS_LEN];
546 struct stat st;
547
548 snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
549 if (stat(path, &st))
550 return ret_errno(ESRCH);
551
552 store_lock();
553
554 hashed_pid = lookup_verify_initpid(st.st_ino);
555 if (hashed_pid < 0) {
556 /* release the mutex as the following call is expensive */
557 store_unlock();
558
559 hashed_pid = scm_init_pid(pid);
560
561 store_lock();
562
563 if (hashed_pid > 0)
564 save_initpid(st.st_ino, hashed_pid);
565 }
566
567 /*
568 	 * Prune at the end, in case the prune removes the very
569 	 * entry whose value we are about to return.
570 */
571 prune_initpid_store();
572 store_unlock();
573
574 return hashed_pid;
575 }
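/*
 * Typical use from a FUSE request handler (illustrative sketch; the real
 * proc and cgroup handlers add further checks):
 *
 *	struct fuse_context *fc = fuse_get_context();
 *	pid_t initpid = lookup_initpid_in_store(fc->pid);
 *	if (initpid <= 1)
 *		initpid = fc->pid;
 *
 * i.e. when no distinct init pid can be resolved the handler falls back to
 * the requesting pid itself.
 */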
576
577 /*
578 * Functions needed to setup cgroups in the __constructor__.
579 */
580
581 static bool umount_if_mounted(void)
582 {
583 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
584 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
585 return false;
586 }
587 return true;
588 }
589
590 /* __typeof__ should be safe to use with all compilers. */
591 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
592 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
593 {
594 return (fs->f_type == (fs_type_magic)magic_val);
595 }
596
597 /*
598  * Looking at fs/proc_namespace.c, we can expect the rootfs entry in
599  * mountinfo to contain exactly
600  * " - rootfs rootfs "
601  * and, so long as we've chrooted so that rootfs is not our root,
602  * the rootfs entry should always be omitted from mountinfo contents.
603 */
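/*
 * An illustrative mountinfo line the parser below is looking for (the field
 * values are made up; layout as described in proc(5)):
 *
 *	1 1 0:2 / / rw - rootfs rootfs rw
 *
 * Field five is the mount point ("/") and the fields after the "-"
 * separator are the filesystem type and source ("rootfs rootfs").
 */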
604 static bool is_on_ramfs(void)
605 {
606 __do_free char *line = NULL;
607 __do_free void *fopen_cache = NULL;
608 __do_fclose FILE *f = NULL;
609 size_t len = 0;
610
611 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
612 if (!f)
613 return false;
614
615 while (getline(&line, &len, f) != -1) {
616 int i;
617 char *p, *p2;
618
619 for (p = line, i = 0; p && i < 4; i++)
620 p = strchr(p + 1, ' ');
621 if (!p)
622 continue;
623
624 p2 = strchr(p + 1, ' ');
625 if (!p2)
626 continue;
627 *p2 = '\0';
628 if (strcmp(p + 1, "/") == 0) {
629 /* This is '/'. Is it the ramfs? */
630 p = strchr(p2 + 1, '-');
631 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
632 return true;
633 }
634 }
635
636 return false;
637 }
638
639 static int pivot_enter(void)
640 {
641 __do_close int oldroot = -EBADF, newroot = -EBADF;
642
643 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
644 if (oldroot < 0)
645 return log_error_errno(-1, errno,
646 "Failed to open old root for fchdir");
647
648 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
649 if (newroot < 0)
650 return log_error_errno(-1, errno,
651 "Failed to open new root for fchdir");
652
653 /* change into new root fs */
654 if (fchdir(newroot) < 0)
655 return log_error_errno(-1,
656 errno, "Failed to change directory to new rootfs: %s",
657 ROOTDIR);
658
659 /* pivot_root into our new root fs */
660 if (pivot_root(".", ".") < 0)
661 return log_error_errno(-1, errno,
662 "pivot_root() syscall failed: %s",
663 strerror(errno));
664
665 /*
666 * At this point the old-root is mounted on top of our new-root.
667 	 * To unmount it we must not be chdir'd into it, so escape back
668 * to the old-root.
669 */
670 if (fchdir(oldroot) < 0)
671 return log_error_errno(-1, errno, "Failed to enter old root");
672
673 if (umount2(".", MNT_DETACH) < 0)
674 return log_error_errno(-1, errno, "Failed to detach old root");
675
676 if (fchdir(newroot) < 0)
677 return log_error_errno(-1, errno, "Failed to re-enter new root");
678
679 return 0;
680 }
681
682 static int chroot_enter(void)
683 {
684 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
685 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
686 return -1;
687 }
688
689 if (chroot(".") < 0) {
690 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
691 return -1;
692 }
693
694 if (chdir("/") < 0) {
695 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
696 return -1;
697 }
698
699 return 0;
700 }
701
702 static int permute_and_enter(void)
703 {
704 struct statfs sb;
705
706 if (statfs("/", &sb) < 0) {
707 lxcfs_error("%s\n", "Could not stat / mountpoint.");
708 return -1;
709 }
710
711 	/* has_fs_type() is not reliable: when the ramfs is actually a tmpfs it
712 	 * will likely report TMPFS_MAGIC. Hence, when it does not report
713 	 * RAMFS_MAGIC we still check /proc/self/mountinfo. */
714 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
715 return chroot_enter();
716
717 if (pivot_enter() < 0) {
718 lxcfs_error("%s\n", "Could not perform pivot root.");
719 return -1;
720 }
721
722 return 0;
723 }
724
725 /* Prepare our new clean root. */
726 static int permute_prepare(void)
727 {
728 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
729 lxcfs_error("%s\n", "Failed to create directory for new root.");
730 return -1;
731 }
732
733 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
734 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
735 return -1;
736 }
737
738 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
739 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
740 return -1;
741 }
742
743 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
744 		lxcfs_error("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
745 return -1;
746 }
747
748 return 0;
749 }
750
751 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
752 static bool permute_root(void)
753 {
754 /* Prepare new root. */
755 if (permute_prepare() < 0)
756 return false;
757
758 /* Pivot into new root. */
759 if (permute_and_enter() < 0)
760 return false;
761
762 return true;
763 }
764
765 static bool cgfs_prepare_mounts(void)
766 {
767 if (!mkdir_p(BASEDIR, 0700)) {
768 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
769 return false;
770 }
771
772 if (!umount_if_mounted()) {
773 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
774 return false;
775 }
776
777 if (unshare(CLONE_NEWNS) < 0) {
778 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
779 return false;
780 }
781
782 cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
783 if (cgroup_ops->mntns_fd < 0) {
784 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
785 return false;
786 }
787
788 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
789 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
790 return false;
791 }
792
793 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
794 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
795 return false;
796 }
797
798 return true;
799 }
800
801 static bool cgfs_mount_hierarchies(void)
802 {
803 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
804 return false;
805
806 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
807 return false;
808
809 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
810 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
811 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
812 if ((*h)->fd < 0)
813 return false;
814 }
815
816 return true;
817 }
818
819 static bool cgfs_setup_controllers(void)
820 {
821 if (!cgfs_prepare_mounts())
822 return false;
823
824 if (!cgfs_mount_hierarchies())
825 return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");
826
827 if (!permute_root())
828 return false;
829
830 return true;
831 }
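/*
 * Summary of the setup sequence above: cgfs_prepare_mounts() unshares a
 * fresh mount namespace, remounts "/" private and places a small tmpfs over
 * BASEDIR; cgfs_mount_hierarchies() mounts the cgroup hierarchies beneath
 * it and caches a directory fd per hierarchy; permute_root() then pivots
 * (or, on ramfs, chroots) into ROOTDIR so the private cgroup mounts stay
 * reachable from lxcfs' new root while remaining invisible to the rest of
 * the system.
 */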
832
833 static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra)
834 {
835 int ret;
836
837 if (reload_successful) {
838 reload_successful = 0;
839
840 /* write() is async signal safe */
841 ret = write(STDERR_FILENO,
842 "Switched into non-virtualization mode\n",
843 STRLITERALLEN("Switched into non-virtualization mode\n"));
844 if (ret < 0)
845 goto please_compiler;
846 } else {
847 reload_successful = 1;
848
849 /* write() is async signal safe */
850 ret = write(STDERR_FILENO, "Switched into virtualization mode\n",
851 STRLITERALLEN("Switched into virtualization mode\n"));
852 if (ret < 0)
853 goto please_compiler;
854 }
855
856 please_compiler:
857 /*
858 	 * The return value of write() needs to be checked, otherwise the
859 	 * compiler will warn. Another option would be to call
860 	 * syscall(__NR_write, ...) directly, but this is good enough.
861 */
862 return;
863 }
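/*
 * Virtualization can be flipped at runtime by sending SIGUSR2 to the lxcfs
 * process, e.g. (illustrative) `kill -USR2 "$(pidof lxcfs)"`; each signal
 * toggles between "virtualization mode" and "non-virtualization mode" as
 * logged above.
 */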
864
865 static void __attribute__((constructor)) lxcfs_init(void)
866 {
867 __do_close int init_ns = -EBADF, root_fd = -EBADF,
868 pidfd = -EBADF;
869 __do_free char *cgroup = NULL;
870 int i = 0;
871 pid_t pid;
872 struct hierarchy *hierarchy;
873
874 lxcfs_info("Running constructor %s to reload liblxcfs", __func__);
875
876 cgroup_ops = cgroup_init();
877 if (!cgroup_ops) {
878 lxcfs_info("Failed to initialize cgroup support");
879 goto broken_upgrade;
880 }
881
882 /* Preserve initial namespace. */
883 pid = getpid();
884 init_ns = preserve_ns(pid, "mnt");
885 if (init_ns < 0) {
886 lxcfs_info("Failed to preserve initial mount namespace");
887 goto broken_upgrade;
888 }
889
890 	/* This function unshares (CLONE_NEWNS) from our initial mount namespace
891 	 * so we can mount the lxcfs cgroup hierarchies privately. */
892 if (!cgfs_setup_controllers()) {
893 log_exit("Failed to setup private cgroup mounts for lxcfs");
894 goto broken_upgrade;
895 }
896
897 if (setns(init_ns, 0) < 0) {
898 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
899 goto broken_upgrade;
900 }
901
902 if (!init_cpuview()) {
903 log_exit("Failed to init CPU view");
904 goto broken_upgrade;
905 }
906
907 lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
908 lxcfs_info("hierarchies:");
909
910 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
911 char **controller_list = (*h)->controllers;
912 __do_free char *controllers = NULL;
913 if (controller_list && *controller_list)
914 controllers = lxc_string_join(",", (const char **)controller_list, false);
915 lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
916 }
917
918 pidfd = pidfd_open(pid, 0);
919 if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
920 can_use_pidfd = true;
921 lxcfs_info("Kernel supports pidfds");
922 }
923
924 cgroup = get_pid_cgroup(pid, "memory");
925 can_use_swap = cgroup && cgroup_ops->can_use_swap(cgroup_ops, cgroup);
926 if (can_use_swap)
927 lxcfs_info("Kernel supports swap accounting");
928 else
929 lxcfs_info("Kernel does not support swap accounting");
930
931 hierarchy = cgroup_ops->get_hierarchy(cgroup_ops, "memory");
932 memory_is_cgroupv2 = hierarchy && is_unified_hierarchy(hierarchy);
933
934 lxcfs_info("api_extensions:");
935 for (size_t nr = 0; nr < nr_api_extensions; nr++)
936 lxcfs_info("- %s", api_extensions[nr]);
937
938 root_fd = open("/", O_PATH | O_CLOEXEC);
939 if (root_fd < 0)
940 lxcfs_info("%s - Failed to open root directory", strerror(errno));
941 else if (fchdir(root_fd) < 0)
942 lxcfs_info("%s - Failed to change to root directory", strerror(errno));
943
944 if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
945 lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
946 goto broken_upgrade;
947 }
948
949 if (get_task_personality(getpid(), &host_personality) < 0) {
950 lxcfs_info("Failed to retrieve host personality");
951 goto broken_upgrade;
952 }
953
954 reload_successful = 1;
955 return;
956
957 broken_upgrade:
958 reload_successful = 0;
959 lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
960 }
961
962 static void __attribute__((destructor)) lxcfs_exit(void)
963 {
964 lxcfs_info("Running destructor %s", __func__);
965
966 clear_initpid_store();
967 free_cpuview();
968 cgroup_exit(cgroup_ops);
969 }
970
971 void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data)
972 {
973 struct fuse_context *fc = fuse_get_context();
974 #if HAVE_FUSE_RETURNS_DT_TYPE
975 can_use_sys_cpu = true;
976 #endif
977 has_versioned_opts = true;
978 return fc ? fc->private_data : NULL;
979 }