src/bindings.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE
   5 #endif
   6
   7 #ifndef FUSE_USE_VERSION
   8 #define FUSE_USE_VERSION 26
   9 #endif
  10
  11 #define _FILE_OFFSET_BITS 64
  12
  13 #include <dirent.h>
  14 #include <errno.h>
  15 #include <fcntl.h>
  16 #include <fuse.h>
  17 #include <inttypes.h>
  18 #include <libgen.h>
  19 #include <linux/magic.h>
  20 #include <linux/sched.h>
  21 #include <pthread.h>
  22 #include <sched.h>
  23 #include <stdarg.h>
  24 #include <stdbool.h>
  25 #include <stdint.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <sys/epoll.h>
  30 #include <sys/mman.h>
  31 #include <sys/mount.h>
  32 #include <sys/param.h>
  33 #include <sys/socket.h>
  34 #include <sys/syscall.h>
  35 #include <sys/sysinfo.h>
  36 #include <sys/vfs.h>
  37 #include <time.h>
  38 #include <unistd.h>
  39 #include <wait.h>
  40
  41 #include "api_extensions.h"
  42 #include "bindings.h"
  43 #include "cgroup_fuse.h"
  44 #include "cgroups/cgroup.h"
  45 #include "cgroups/cgroup_utils.h"
  46 #include "config.h"
  47 #include "memory_utils.h"
  48 #include "proc_cpuview.h"
  49 #include "syscall_numbers.h"
  50 #include "utils.h"
  51
  52 static bool can_use_pidfd;
  53
  54 static volatile sig_atomic_t reload_successful;
  55
  56 bool liblxcfs_functional(void)
  57 {
  58         return reload_successful != 0;
  59 }
  60
  61 /* Define pivot_root() if missing from the C library */
  62 #ifndef HAVE_PIVOT_ROOT
  63 static int pivot_root(const char *new_root, const char *put_old)
  64 {
  65         return syscall(__NR_pivot_root, new_root, put_old);
  66 }
  67 #else
  68 extern int pivot_root(const char *new_root, const char *put_old);
  69 #endif
  70
  71 /*
  72  * A table caching which pid is init for a pid namespace.
  73  * When looking up which pid is init for $qpid, we first
  74  * 1. Stat /proc/$qpid/ns/pid.
  75  * 2. Check whether the ino_t is in our store.
  76  *   a. if not, fork a child in qpid's ns to send us
  77  *       ucred.pid = 1, and read the initpid.  Cache
  78  *       initpid and creation time for /proc/initpid
  79  *       in a new store entry.
  80  *   b. if so, verify that /proc/initpid still matches
  81  *       what we have saved.  If not, clear the store
  82  *       entry and go back to a.  If so, return the
  83  *       cached initpid.
  84  */
  85 struct pidns_init_store {
  86         ino_t ino;     /* inode number for /proc/$pid/ns/pid */
  87         pid_t initpid; /* the pid of nit in that ns */
  88         int init_pidfd;
  89         int64_t ctime; /* the time at which /proc/$initpid was created */
  90         struct pidns_init_store *next;
  91         int64_t lastcheck;
  92 };
  93
  94 /* lol - look at how they are allocated in the kernel */
  95 #define PIDNS_HASH_SIZE 4096
  96 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
  97
  98 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
  99 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 100
 101 static void lock_mutex(pthread_mutex_t *l)
 102 {
 103         int ret;
 104
 105         ret = pthread_mutex_lock(l);
 106         if (ret)
 107                 log_exit("%s - returned %d\n", strerror(ret), ret);
 108 }
 109
 110 struct cgroup_ops *cgroup_ops;
 111
 112 static void unlock_mutex(pthread_mutex_t *l)
 113 {
 114         int ret;
 115
 116         ret = pthread_mutex_unlock(l);
 117         if (ret)
 118                 log_exit("%s - returned %d\n", strerror(ret), ret);
 119 }
 120
 121 static inline void unlock_mutex_function(pthread_mutex_t **mutex)
 122 {
 123         if (*mutex)
 124                 unlock_mutex(*mutex);
 125 }
 126 #define __do_unlock call_cleaner(unlock_mutex)
 127
 128 static pthread_mutex_t* __attribute__((warn_unused_result)) store_lock(void)
 129 {
 130         lock_mutex(&pidns_store_mutex);
 131         return &pidns_store_mutex;
 132 }
 133
 134 /* /proc/       =    6
 135  *                +
 136  * <pid-as-str> =   INTTYPE_TO_STRLEN(pid_t)
 137  *                +
 138  * \0           =    1
 139  */
 140 #define LXCFS_PROC_PID_LEN \
 141         (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + +1)
 142
 143 static int initpid_still_valid_pidfd(struct pidns_init_store *entry)
 144 {
 145         int ret;
 146
 147         if (entry->init_pidfd < 0)
 148                 return ret_errno(ENOSYS);
 149
 150         ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0);
 151         if (ret < 0) {
 152                 if (errno == ENOSYS)
 153                         return ret_errno(ENOSYS);
 154
 155                 return 0;
 156         }
 157
 158         return 1;
 159 }
 160
 161 static int initpid_still_valid_stat(struct pidns_init_store *entry)
 162 {
 163         struct stat st;
 164         char path[LXCFS_PROC_PID_LEN];
 165
 166         snprintf(path, sizeof(path), "/proc/%d", entry->initpid);
 167         if (stat(path, &st) || entry->ctime != st.st_ctime)
 168                 return 0;
 169
 170         return 1;
 171 }
 172
 173 /* Must be called under store_lock */
 174 static bool initpid_still_valid(struct pidns_init_store *entry)
 175 {
 176         int ret;
 177
 178         ret = initpid_still_valid_pidfd(entry);
 179         if (ret < 0)
 180                 ret = initpid_still_valid_stat(entry);
 181
 182         return ret == 1;
 183 }
 184
 185 /* Must be called under store_lock */
 186 static void remove_initpid(struct pidns_init_store *entry)
 187 {
 188         struct pidns_init_store *it;
 189         int ino_hash;
 190
 191         lxcfs_debug("Removing cached entry for pid %d from init pid cache",
 192                     entry->initpid);
 193
 194         ino_hash = HASH(entry->ino);
 195         if (pidns_hash_table[ino_hash] == entry) {
 196                 pidns_hash_table[ino_hash] = entry->next;
 197                 close_prot_errno_disarm(entry->init_pidfd);
 198                 free_disarm(entry);
 199                 return;
 200         }
 201
 202         it = pidns_hash_table[ino_hash];
 203         while (it) {
 204                 if (it->next == entry) {
 205                         it->next = entry->next;
 206                         close_prot_errno_disarm(entry->init_pidfd);
 207                         free_disarm(entry);
 208                         return;
 209                 }
 210                 it = it->next;
 211         }
 212 }
 213
 214 #define PURGE_SECS 5
 215 /* Must be called under store_lock */
 216 static void prune_initpid_store(void)
 217 {
 218         static int64_t last_prune = 0;
 219         int64_t now, threshold;
 220
 221         if (!last_prune) {
 222                 last_prune = time(NULL);
 223                 return;
 224         }
 225
 226         now = time(NULL);
 227         if (now < last_prune + PURGE_SECS)
 228                 return;
 229
 230         lxcfs_debug("Pruning init pid cache");
 231
 232         last_prune = now;
 233         threshold = now - 2 * PURGE_SECS;
 234
 235         for (int i = 0; i < PIDNS_HASH_SIZE; i++) {
 236                 for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) {
 237                         if (entry->lastcheck < threshold) {
 238                                 struct pidns_init_store *cur = entry;
 239
 240                                 lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid);
 241
 242                                 if (prev)
 243                                         prev->next = entry->next;
 244                                 else
 245                                         pidns_hash_table[i] = entry->next;
 246                                 entry = entry->next;
 247                                 close_prot_errno_disarm(cur->init_pidfd);
 248                                 free_disarm(cur);
 249                         } else {
 250                                 prev = entry;
 251                                 entry = entry->next;
 252                         }
 253                 }
 254         }
 255 }
 256
 257 /* Must be called under store_lock */
 258 static void save_initpid(ino_t pidns_inode, pid_t pid)
 259 {
 260         __do_free struct pidns_init_store *entry = NULL;
 261         __do_close int pidfd = -EBADF;
 262         char path[LXCFS_PROC_PID_LEN];
 263         struct lxcfs_opts *opts = fuse_get_context()->private_data;
 264         struct stat st;
 265         int ino_hash;
 266
 267         if (opts && opts->use_pidfd && can_use_pidfd) {
 268                 pidfd = pidfd_open(pid, 0);
 269                 if (pidfd < 0)
 270                         return;
 271         }
 272
 273         snprintf(path, sizeof(path), "/proc/%d", pid);
 274         if (stat(path, &st))
 275                 return;
 276
 277         entry = malloc(sizeof(*entry));
 278         if (entry)
 279                 return;
 280
 281         ino_hash = HASH(entry->ino);
 282         *entry = (struct pidns_init_store){
 283                 .ino            = pidns_inode,
 284                 .initpid        = pid,
 285                 .ctime          = st.st_ctime,
 286                 .next           = pidns_hash_table[ino_hash],
 287                 .lastcheck      = time(NULL),
 288                 .init_pidfd     = move_fd(pidfd),
 289         };
 290         pidns_hash_table[ino_hash] = move_ptr(entry);
 291
 292         lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid);
 293 }
 294
 295 /*
 296  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 297  * entry for the inode number and creation time.  Verify that the init pid
 298  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 299  * otherwise.
 300  * Must be called under store_lock
 301  */
 302 static struct pidns_init_store *lookup_verify_initpid(ino_t pidns_inode)
 303 {
 304         struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)];
 305
 306         while (entry) {
 307                 if (entry->ino == pidns_inode) {
 308                         if (initpid_still_valid(entry)) {
 309                                 entry->lastcheck = time(NULL);
 310                                 return entry;
 311                         }
 312
 313                         remove_initpid(entry);
 314                         return NULL;
 315                 }
 316                 entry = entry->next;
 317         }
 318
 319         return NULL;
 320 }
 321
 322 static int send_creds_clone_wrapper(void *arg)
 323 {
 324         int sock = PTR_TO_INT(arg);
 325         char v = '1'; /* we are the child */
 326         struct ucred cred = {
 327             .uid = 0,
 328             .gid = 0,
 329             .pid = 1,
 330         };
 331
 332         return send_creds(sock, &cred, v, true) != SEND_CREDS_OK;
 333 }
 334
 335 /*
 336  * Let's use the "standard stack limit" (i.e. glibc thread size default) for
 337  * stack sizes: 8MB.
 338  */
 339 #define __LXCFS_STACK_SIZE (8 * 1024 * 1024)
 340 pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags)
 341 {
 342         pid_t ret;
 343         void *stack;
 344
 345         stack = malloc(__LXCFS_STACK_SIZE);
 346         if (!stack)
 347                 return ret_errno(ENOMEM);
 348
 349 #ifdef __ia64__
 350         ret = __clone2(fn, stack, __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
 351 #else
 352         ret = clone(fn, stack + __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL);
 353 #endif
 354         return ret;
 355 }
 356
 357 #define LXCFS_PROC_PID_NS_LEN                                    \
 358         (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
 359          STRLITERALLEN("/ns/pid") + 1)
 360
 361 /*
 362  * clone a task which switches to @task's namespace and writes '1'.
 363  * over a unix sock so we can read the task's reaper's pid in our
 364  * namespace
 365  *
 366  * Note: glibc's fork() does not respect pidns, which can lead to failed
 367  * assertions inside glibc (and thus failed forks) if the child's pid in
 368  * the pidns and the parent pid outside are identical. Using clone prevents
 369  * this issue.
 370  */
 371 static void write_task_init_pid_exit(int sock, pid_t target)
 372 {
 373         __do_close int fd = -EBADF;
 374         char path[LXCFS_PROC_PID_NS_LEN];
 375         pid_t pid;
 376
 377         snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target);
 378         fd = open(path, O_RDONLY | O_CLOEXEC);
 379         if (fd < 0)
 380                 log_exit("write_task_init_pid_exit open of ns/pid");
 381
 382         if (setns(fd, 0))
 383                 log_exit("Failed to setns to pid namespace of process %d", target);
 384
 385         pid = lxcfs_clone(send_creds_clone_wrapper, INT_TO_PTR(sock), 0);
 386         if (pid < 0)
 387                 _exit(EXIT_FAILURE);
 388
 389         if (pid != 0) {
 390                 if (!wait_for_pid(pid))
 391                         _exit(EXIT_FAILURE);
 392
 393                 _exit(EXIT_SUCCESS);
 394         }
 395 }
 396
 397 static pid_t get_init_pid_for_task(pid_t task)
 398 {
 399         char v = '0';
 400         pid_t pid_ret = -1;
 401         pid_t pid;
 402         int sock[2];
 403         struct ucred cred;
 404
 405         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0)
 406                 return -1;
 407
 408         pid = fork();
 409         if (pid < 0)
 410                 goto out;
 411
 412         if (pid == 0) {
 413                 close(sock[1]);
 414                 write_task_init_pid_exit(sock[0], task);
 415                 _exit(EXIT_SUCCESS);
 416         }
 417
 418         if (!recv_creds(sock[1], &cred, &v))
 419                 goto out;
 420
 421         pid_ret = cred.pid;
 422
 423 out:
 424         close(sock[0]);
 425         close(sock[1]);
 426         if (pid > 0)
 427                 wait_for_pid(pid);
 428
 429         return pid_ret;
 430 }
 431
 432 pid_t lookup_initpid_in_store(pid_t pid)
 433 {
 434         __do_unlock pthread_mutex_t *store_mutex = NULL;
 435         pid_t answer = 0;
 436         char path[LXCFS_PROC_PID_NS_LEN];
 437         struct stat st;
 438         struct pidns_init_store *entry;
 439
 440         snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
 441
 442         if (stat(path, &st))
 443                 goto out;
 444
 445         store_mutex = store_lock();
 446
 447         entry = lookup_verify_initpid(st.st_ino);
 448         if (entry) {
 449                 answer = entry->initpid;
 450                 goto out;
 451         }
 452
 453         /* release the mutex as the following call is expensive */
 454         unlock_mutex(move_ptr(store_mutex));
 455         answer = get_init_pid_for_task(pid);
 456         store_mutex = store_lock();
 457
 458         if (answer > 0)
 459                 save_initpid(st.st_ino, answer);
 460
 461 out:
 462         /*
 463          * Prune at the end in case we're returning the value we were about to
 464          * return.
 465          */
 466         prune_initpid_store();
 467
 468         return answer;
 469 }
 470
/*
 * Functions needed to set up cgroups in the __constructor__.
 */
 474
 475 static bool umount_if_mounted(void)
 476 {
 477         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
 478                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
 479                 return false;
 480         }
 481         return true;
 482 }
 483
 484 /* __typeof__ should be safe to use with all compilers. */
 485 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
 486 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
 487 {
 488         return (fs->f_type == (fs_type_magic)magic_val);
 489 }
 490
 491 /*
 492  * looking at fs/proc_namespace.c, it appears we can
 493  * actually expect the rootfs entry to very specifically contain
 494  * " - rootfs rootfs "
 495  * IIUC, so long as we've chrooted so that rootfs is not our root,
 496  * the rootfs entry should always be skipped in mountinfo contents.
 497  */
 498 static bool is_on_ramfs(void)
 499 {
 500         __do_free char *line = NULL;
 501         __do_free void *fopen_cache = NULL;
 502         __do_fclose FILE *f = NULL;
 503         size_t len = 0;
 504
 505         f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
 506         if (!f)
 507                 return false;
 508
 509         while (getline(&line, &len, f) != -1) {
 510                 int i;
 511                 char *p, *p2;
 512
 513                 for (p = line, i = 0; p && i < 4; i++)
 514                         p = strchr(p + 1, ' ');
 515                 if (!p)
 516                         continue;
 517
 518                 p2 = strchr(p + 1, ' ');
 519                 if (!p2)
 520                         continue;
 521                 *p2 = '\0';
 522                 if (strcmp(p + 1, "/") == 0) {
 523                         /* This is '/'. Is it the ramfs? */
 524                         p = strchr(p2 + 1, '-');
 525                         if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
 526                                 return true;
 527                 }
 528         }
 529
 530         return false;
 531 }
 532
 533 static int pivot_enter()
 534 {
 535         __do_close int oldroot = -EBADF, newroot = -EBADF;
 536
 537         oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
 538         if (oldroot < 0)
 539                 return log_error_errno(-1, errno,
 540                                        "Failed to open old root for fchdir");
 541
 542         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
 543         if (newroot < 0)
 544                 return log_error_errno(-1, errno,
 545                                        "Failed to open new root for fchdir");
 546
 547         /* change into new root fs */
 548         if (fchdir(newroot) < 0)
 549                 return log_error_errno(-1,
 550                                        errno, "Failed to change directory to new rootfs: %s",
 551                                        ROOTDIR);
 552
 553         /* pivot_root into our new root fs */
 554         if (pivot_root(".", ".") < 0)
 555                 return log_error_errno(-1, errno,
 556                                        "pivot_root() syscall failed: %s",
 557                                        strerror(errno));
 558
 559         /*
 560          * At this point the old-root is mounted on top of our new-root.
 561          * To unmounted it we must not be chdir'd into it, so escape back
 562          * to the old-root.
 563          */
 564         if (fchdir(oldroot) < 0)
 565                 return log_error_errno(-1, errno, "Failed to enter old root");
 566
 567         if (umount2(".", MNT_DETACH) < 0)
 568                 return log_error_errno(-1, errno, "Failed to detach old root");
 569
 570         if (fchdir(newroot) < 0)
 571                 return log_error_errno(-1, errno, "Failed to re-enter new root");
 572
 573         return 0;
 574 }
 575
 576 static int chroot_enter()
 577 {
 578         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
 579                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
 580                 return -1;
 581         }
 582
 583         if (chroot(".") < 0) {
 584                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
 585                 return -1;
 586         }
 587
 588         if (chdir("/") < 0) {
 589                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
 590                 return -1;
 591         }
 592
 593         return 0;
 594 }
 595
 596 static int permute_and_enter(void)
 597 {
 598         struct statfs sb;
 599
 600         if (statfs("/", &sb) < 0) {
 601                 lxcfs_error("%s\n", "Could not stat / mountpoint.");
 602                 return -1;
 603         }
 604
 605         /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
 606          * likely report TMPFS_MAGIC. Hence, when it reports no we still check
 607          * /proc/1/mountinfo. */
 608         if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
 609                 return chroot_enter();
 610
 611         if (pivot_enter() < 0) {
 612                 lxcfs_error("%s\n", "Could not perform pivot root.");
 613                 return -1;
 614         }
 615
 616         return 0;
 617 }
 618
 619 /* Prepare our new clean root. */
 620 static int permute_prepare(void)
 621 {
 622         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
 623                 lxcfs_error("%s\n", "Failed to create directory for new root.");
 624                 return -1;
 625         }
 626
 627         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
 628                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
 629                 return -1;
 630         }
 631
 632         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
 633                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
 634                 return -1;
 635         }
 636
 637         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
 638                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
 639                 return -1;
 640         }
 641
 642         return 0;
 643 }
 644
 645 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
 646 static bool permute_root(void)
 647 {
 648         /* Prepare new root. */
 649         if (permute_prepare() < 0)
 650                 return false;
 651
 652         /* Pivot into new root. */
 653         if (permute_and_enter() < 0)
 654                 return false;
 655
 656         return true;
 657 }
 658
 659 static bool cgfs_prepare_mounts(void)
 660 {
 661         if (!mkdir_p(BASEDIR, 0700)) {
 662                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
 663                 return false;
 664         }
 665
 666         if (!umount_if_mounted()) {
 667                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
 668                 return false;
 669         }
 670
 671         if (unshare(CLONE_NEWNS) < 0) {
 672                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
 673                 return false;
 674         }
 675
 676         cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
 677         if (cgroup_ops->mntns_fd < 0) {
 678                 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
 679                 return false;
 680         }
 681
 682         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
 683                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
 684                 return false;
 685         }
 686
 687         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
 688                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
 689                 return false;
 690         }
 691
 692         return true;
 693 }
 694
 695 static bool cgfs_mount_hierarchies(void)
 696 {
 697         if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
 698                 return false;
 699
 700         if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
 701                 return false;
 702
 703         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
 704                 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
 705                 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
 706                 if ((*h)->fd < 0)
 707                         return false;
 708         }
 709
 710         return true;
 711 }
 712
 713 static bool cgfs_setup_controllers(void)
 714 {
 715         if (!cgfs_prepare_mounts())
 716                 return false;
 717
 718         if (!cgfs_mount_hierarchies())
 719                 return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");
 720
 721         if (!permute_root())
 722                 return false;
 723
 724         return true;
 725 }
 726
 727 static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra)
 728 {
 729         int ret;
 730
 731         if (reload_successful) {
 732                 reload_successful = 0;
 733
 734                 /* write() is async signal safe */
 735                 ret = write(STDERR_FILENO,
 736                             "Switched into non-virtualization mode\n",
 737                             STRLITERALLEN("Switched into non-virtualization mode\n"));
 738                 if (ret < 0)
 739                         goto please_compiler;
 740         } else {
 741                 reload_successful = 1;
 742
 743                 /* write() is async signal safe */
 744                 ret = write(STDERR_FILENO, "Switched into virtualization mode\n",
 745                             STRLITERALLEN("Switched into virtualization mode\n"));
 746                 if (ret < 0)
 747                         goto please_compiler;
 748         }
 749
 750 please_compiler:
 751         /*
 752          * The write() syscall is a function whose return value needs to be
 753          * checked. Otherwise the compiler will warn. This is how we
 754          * please our master. Another one could be to use
 755          * syscall(__NR_write, ...) directly but whatever.
 756          */
 757         return;
 758 }
 759
 760 static void __attribute__((constructor)) lxcfs_init(void)
 761 {
 762         __do_close int init_ns = -EBADF, root_fd = -EBADF,
 763                                   pidfd = -EBADF;
 764         int i = 0;
 765         pid_t pid;
 766
 767         lxcfs_info("Running constructor %s to reload liblxcfs", __func__);
 768
 769         cgroup_ops = cgroup_init();
 770         if (!cgroup_ops) {
 771                 lxcfs_info("Failed to initialize cgroup support");
 772                 goto broken_upgrade;
 773         }
 774
 775         /* Preserve initial namespace. */
 776         pid = getpid();
 777         init_ns = preserve_ns(pid, "mnt");
 778         if (init_ns < 0) {
 779                 lxcfs_info("Failed to preserve initial mount namespace");
 780                 goto broken_upgrade;
 781         }
 782
 783         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
 784          * to privately mount lxcfs cgroups. */
 785         if (!cgfs_setup_controllers()) {
 786                 log_exit("Failed to setup private cgroup mounts for lxcfs");
 787                 goto broken_upgrade;
 788         }
 789
 790         if (setns(init_ns, 0) < 0) {
 791                 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
 792                 goto broken_upgrade;
 793         }
 794
 795         if (!init_cpuview()) {
 796                 log_exit("Failed to init CPU view");
 797                 goto broken_upgrade;
 798         }
 799
 800         lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
 801         lxcfs_info("hierarchies:");
 802
 803         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
 804                 char **controller_list = (*h)->controllers;
 805                 __do_free char *controllers = NULL;
 806                 if (controller_list && *controller_list)
 807                         controllers = lxc_string_join(",", (const char **)controller_list, false);
 808                 lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
 809         }
 810
 811         pidfd = pidfd_open(pid, 0);
 812         if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
 813                 can_use_pidfd = true;
 814                 lxcfs_info("Kernel supports pidfds");
 815         }
 816
 817         lxcfs_info("api_extensions:");
 818         for (i = 0; i < nr_api_extensions; i++)
 819                 lxcfs_info("- %s", api_extensions[i]);
 820
 821         root_fd = open("/", O_PATH | O_CLOEXEC);
 822         if (root_fd < 0)
 823                 lxcfs_info("%s - Failed to open root directory", strerror(errno));
 824         else if (fchdir(root_fd) < 0)
 825                 lxcfs_info("%s - Failed to change to root directory", strerror(errno));
 826
 827         if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
 828                 lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
 829                 goto broken_upgrade;
 830         }
 831
 832         reload_successful = 1;
 833         return;
 834
 835 broken_upgrade:
 836         reload_successful = 0;
 837         lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
 838 }
 839
 840 static void __attribute__((destructor)) lxcfs_exit(void)
 841 {
 842         lxcfs_info("Running destructor %s", __func__);
 843
 844         free_cpuview();
 845         cgroup_exit(cgroup_ops);
 846 }