+/*
+ * Functions needed to set up cgroups in the __constructor__.
+ */
+
+/*
+ * Create @dir and every missing parent directory, passing @mode to each
+ * mkdir() call.
+ *
+ * Path components that already exist are not treated as an error. Returns
+ * true on success and false if @dir is NULL or empty, if memory cannot be
+ * allocated, or if a mkdir() call fails with anything other than EEXIST (in
+ * which case a diagnostic is written to stderr).
+ */
+static bool mkdir_p(const char *dir, mode_t mode)
+{
+	const char *orig = dir;
+	const char *tmp = dir;
+	char *makeme;
+
+	if (!dir || !*dir)
+		return false;
+
+	do {
+		/* Skip a run of separators, then the path component after it. */
+		dir = tmp + strspn(tmp, "/");
+		tmp = dir + strcspn(dir, "/");
+
+		/* Everything from the start of the path up to this component. */
+		makeme = strndup(orig, dir - orig);
+		if (!makeme)
+			return false;
+
+		/*
+		 * For a relative path the very first prefix is empty: there is
+		 * nothing to create for it, and mkdir("") would fail with
+		 * ENOENT, so skip it.
+		 */
+		if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
+			fprintf(stderr, "failed to create directory '%s': %s\n",
+				makeme, strerror(errno));
+			free(makeme);
+			return false;
+		}
+		free(makeme);
+	} while (tmp != dir);
+
+	return true;
+}
+
+/*
+ * Lazily detach (MNT_DETACH) whatever is mounted on BASEDIR.
+ *
+ * A umount2() failure with EINVAL (which umount(2) documents for a target
+ * that is not a mount point) is tolerated. Any other failure is reported on
+ * stderr and makes the function return false; otherwise it returns true.
+ */
+static bool umount_if_mounted(void)
+{
+	int rc = umount2(BASEDIR, MNT_DETACH);
+
+	if (rc >= 0 || errno == EINVAL)
+		return true;
+
+	fprintf(stderr, "failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
+	return false;
+}
+
+/*
+ * Make ROOTDIR the root filesystem via pivot_root() and lazily detach the
+ * previous root.
+ *
+ * Directory file descriptors for the old and new root are opened up front so
+ * that fchdir() can move between them after the pivot. On return the working
+ * directory is the new root (on success). Returns 0 on success and -1 on
+ * failure, with a diagnostic on stderr; no rollback is attempted if a step
+ * after pivot_root() fails.
+ */
+static int pivot_enter(void)
+{
+	int ret = -1, oldroot = -1, newroot = -1;
+
+	oldroot = open("/", O_DIRECTORY | O_RDONLY);
+	if (oldroot < 0) {
+		fprintf(stderr, "%s: Failed to open old root for fchdir.\n", __func__);
+		return ret;
+	}
+
+	newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
+	if (newroot < 0) {
+		fprintf(stderr, "%s: Failed to open new root for fchdir.\n", __func__);
+		goto err;
+	}
+
+	/* change into new root fs */
+	if (fchdir(newroot) < 0) {
+		fprintf(stderr, "%s: Failed to change directory to new rootfs: %s.\n", __func__, ROOTDIR);
+		goto err;
+	}
+
+	/* pivot_root into our new root fs */
+	if (pivot_root(".", ".") < 0) {
+		fprintf(stderr, "%s: pivot_root() syscall failed: %s.\n", __func__, strerror(errno));
+		goto err;
+	}
+
+	/*
+	 * At this point the old-root is mounted on top of our new-root.
+	 * To unmount it we must not be chdir'd into the new-root, so escape
+	 * back to the old-root.
+	 */
+	if (fchdir(oldroot) < 0) {
+		fprintf(stderr, "%s: Failed to enter old root.\n", __func__);
+		goto err;
+	}
+	if (umount2(".", MNT_DETACH) < 0) {
+		fprintf(stderr, "%s: Failed to detach old root.\n", __func__);
+		goto err;
+	}
+
+	if (fchdir(newroot) < 0) {
+		fprintf(stderr, "%s: Failed to re-enter new root.\n", __func__);
+		goto err;
+	}
+
+	ret = 0;
+
+err:
+	/*
+	 * 0 is a valid file descriptor (open() returns it when stdin is
+	 * closed), so only -1 means "not opened".
+	 */
+	if (oldroot >= 0)
+		close(oldroot);
+	if (newroot >= 0)
+		close(newroot);
+	return ret;
+}
+
+/*
+ * Prepare our new clean root under ROOTDIR.
+ *
+ * Creates ROOTDIR if needed, bind-mounts "/" onto it, bind-mounts
+ * RUNTIME_PATH to the same path below ROOTDIR, and finally moves the mount
+ * tree at BASEDIR to the same path below ROOTDIR. Returns 0 on success and -1
+ * on the first failing step, after writing a diagnostic to stderr. Mounts
+ * made before the failing step are left in place.
+ */
+static int pivot_prepare(void)
+{
+	if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
+		fprintf(stderr, "%s: Failed to create directory for new root.\n", __func__);
+		return -1;
+	}
+
+	if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
+		fprintf(stderr, "%s: Failed to bind-mount / for new root: %s.\n", __func__, strerror(errno));
+		return -1;
+	}
+
+	if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
+		fprintf(stderr, "%s: Failed to bind-mount /run into new root: %s.\n", __func__, strerror(errno));
+		return -1;
+	}
+
+	if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
+		/* Report on stderr like every other failure in this file section. */
+		fprintf(stderr, "%s: failed to move " BASEDIR " into new root: %s.\n", __func__, strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Build the new root with pivot_prepare() and then switch into it with
+ * pivot_enter(). Returns true only if both steps succeed; pivot_enter() is
+ * not attempted when preparation fails.
+ */
+static bool pivot_new_root(void)
+{
+	return pivot_prepare() >= 0 && pivot_enter() >= 0;
+}
+
+/*
+ * Set up a private tmpfs at BASEDIR.
+ *
+ * Steps, in order: create BASEDIR, detach anything still mounted there,
+ * unshare the mount namespace, mark "/" recursively private, and mount a
+ * tmpfs on BASEDIR. The first failing step prints a diagnostic to stderr and
+ * stops the sequence. Returns true only if every step succeeded.
+ */
+static bool setup_cgfs_dir(void)
+{
+	bool done = false;
+
+	if (!mkdir_p(BASEDIR, 0700))
+		fprintf(stderr, "Failed to create lxcfs cgroup mountpoint.\n");
+	else if (!umount_if_mounted())
+		fprintf(stderr, "Failed to clean up old lxcfs cgroup mountpoint.\n");
+	else if (unshare(CLONE_NEWNS) < 0)
+		fprintf(stderr, "%s: Failed to unshare mount namespace: %s.\n", __func__, strerror(errno));
+	else if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0)
+		fprintf(stderr, "%s: Failed to remount / private: %s.\n", __func__, strerror(errno));
+	else if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0)
+		fprintf(stderr, "Failed to mount tmpfs over lxcfs cgroup mountpoint.\n");
+	else
+		done = true;
+
+	return done;
+}
+
+/*
+ * Mount every cgroup hierarchy listed in hierarchies[] below BASEDIR.
+ *
+ * For each of the num_hierarchies entries, the directory
+ * BASEDIR/<controller> is created (an existing one is accepted), a "cgroup"
+ * filesystem with the controller name as mount data is mounted on it, and a
+ * directory file descriptor for the mount point is stored in
+ * fd_hierarchies[i].
+ *
+ * Returns true if all hierarchies were mounted and opened. On the first
+ * failure a diagnostic is printed (except for allocation failure) and false
+ * is returned; hierarchies mounted before that point are not undone.
+ */
+static bool do_mount_cgroups(void)
+{
+	int i;
+
+	for (i = 0; i < num_hierarchies; i++) {
+		const char *controller = hierarchies[i];
+		/* BASEDIR + '/' + controller + terminating NUL */
+		size_t len = strlen(BASEDIR) + strlen(controller) + 2;
+		char *target = malloc(len);
+		bool ok = false;
+		int ret;
+
+		if (!target)
+			return false;
+
+		ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
+		if (ret < 0 || (size_t)ret >= len) {
+			fprintf(stderr, "Failed to build mount path for cgroup %s\n", controller);
+		} else if (mkdir(target, 0755) < 0 && errno != EEXIST) {
+			fprintf(stderr, "Failed to create directory %s: %s\n", target, strerror(errno));
+		} else if (mount(controller, target, "cgroup", 0, controller) < 0) {
+			fprintf(stderr, "Failed mounting cgroup %s\n", controller);
+		} else {
+			fd_hierarchies[i] = open(target, O_DIRECTORY);
+			if (fd_hierarchies[i] < 0)
+				fprintf(stderr, "Failed to open %s: %s\n", target, strerror(errno));
+			else
+				ok = true;
+		}
+
+		/* Single release point for the path buffer on every outcome. */
+		free(target);
+		if (!ok)
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Top-level cgroup setup: prepare the BASEDIR tmpfs, mount the cgroup
+ * hierarchies into it, then pivot into the new root. Each stage runs only if
+ * the previous one succeeded. Returns true when all three stages succeed.
+ */
+static bool cgfs_setup_controllers(void)
+{
+	bool mounted;
+
+	if (!setup_cgfs_dir())
+		return false;
+
+	mounted = do_mount_cgroups();
+	if (!mounted) {
+		fprintf(stderr, "Failed to set up private lxcfs cgroup mounts.\n");
+		return false;
+	}
+
+	return pivot_new_root();
+}
+
+/*
+ * Open /proc/<pid>/ns/mnt read-only with O_CLOEXEC.
+ *
+ * Returns the file descriptor from open() (which is -1 on failure), or -1 if
+ * the path could not be formatted into the buffer.
+ */
+static int preserve_ns(int pid)
+{
+	/* "/proc" + "/<int as string>" + "/ns/mnt" + terminating NUL */
+	char path[5 + 21 + 7 + 1];
+	int n = snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid);
+
+	if (n < 0 || (size_t)n >= sizeof(path))
+		return -1;
+
+	return open(path, O_RDONLY | O_CLOEXEC);
+}
+
+static void __attribute__((constructor)) collect_and_mount_subsystems(void)