]> git.proxmox.com Git - mirror_lxc.git/blobdiff - src/lxc/utils.c
utils: add lxc_deslashify
[mirror_lxc.git] / src / lxc / utils.c
index 23b1b11ef43b5df5f9422ba2409037560276c1ea..9a6ef4b37528f1cb9460164db8d380b4decff15a 100644 (file)
 
 #include "config.h"
 
+#include <assert.h>
+#include <dirent.h>
 #include <errno.h>
-#include <unistd.h>
-#include <stdlib.h>
+#include <fcntl.h>
+#include <libgen.h>
 #include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
+#include <unistd.h>
 #include <sys/mman.h>
-#include <sys/param.h>
 #include <sys/mount.h>
-#include <dirent.h>
-#include <fcntl.h>
-#include <libgen.h>
+#include <sys/param.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
 #include <sys/types.h>
+#include <sys/vfs.h>
 #include <sys/wait.h>
-#include <assert.h>
 
-#include "utils.h"
 #include "log.h"
 #include "lxclock.h"
 #include "namespace.h"
+#include "utils.h"
+
+#ifndef PR_SET_MM
+#define PR_SET_MM 35
+#endif
+
+#ifndef PR_SET_MM_MAP
+#define PR_SET_MM_MAP 14
+
+/*
+ * Local fallback definition, only compiled when the system headers do not
+ * provide PR_SET_MM_MAP.  setproctitle() fills one of these in and hands it
+ * to prctl(PR_SET_MM, PR_SET_MM_MAP, ...).
+ * NOTE(review): presumably this has to match the kernel's own
+ * struct prctl_mm_map field for field - confirm against <linux/prctl.h>.
+ */
+struct prctl_mm_map {
+        uint64_t   start_code;
+        uint64_t   end_code;
+        uint64_t   start_data;
+        uint64_t   end_data;
+        uint64_t   start_brk;
+        uint64_t   brk;
+        uint64_t   start_stack;
+        uint64_t   arg_start;
+        uint64_t   arg_end;
+        uint64_t   env_start;
+        uint64_t   env_end;
+        uint64_t   *auxv;
+        uint32_t   auxv_size;
+        uint32_t   exe_fd;
+};
+#endif
+
+#ifndef O_PATH
+#define O_PATH      010000000
+#endif
+
+#ifndef O_NOFOLLOW
+#define O_NOFOLLOW  00400000
+#endif
 
 lxc_log_define(lxc_utils, lxc);
 
-static int _recursive_rmdir_onedev(char *dirname, dev_t pdev,
-                                  const char *exclude, int level)
+/*
+ * if path is btrfs, tries to remove it and any subvolumes beneath it
+ */
+extern bool btrfs_try_remove_subvol(const char *path);
+
+static int _recursive_rmdir(char *dirname, dev_t pdev,
+                           const char *exclude, int level, bool onedev)
 {
-       struct dirent dirent, *direntp;
+       struct dirent *direntp;
        DIR *dir;
        int ret, failed=0;
        char pathname[MAXPATHLEN];
@@ -62,7 +102,7 @@ static int _recursive_rmdir_onedev(char *dirname, dev_t pdev,
                return -1;
        }
 
-       while (!readdir_r(dir, &dirent, &direntp)) {
+       while ((direntp = readdir(dir))) {
                struct stat mystat;
                int rc;
 
@@ -85,7 +125,7 @@ static int _recursive_rmdir_onedev(char *dirname, dev_t pdev,
                        if (ret < 0) {
                                switch(errno) {
                                case ENOTEMPTY:
-                                       INFO("Not deleting snapshots");
+                                       INFO("Not deleting snapshot %s", pathname);
                                        hadexclude = true;
                                        break;
                                case ENOTDIR:
@@ -105,27 +145,30 @@ static int _recursive_rmdir_onedev(char *dirname, dev_t pdev,
                ret = lstat(pathname, &mystat);
                if (ret) {
                        ERROR("%s: failed to stat %s", __func__, pathname);
-                       failed=1;
+                       failed = 1;
                        continue;
                }
-               if (mystat.st_dev != pdev)
+               if (onedev && mystat.st_dev != pdev) {
+                       /* TODO should we be checking /proc/self/mountinfo for
+                        * pathname and not doing this if found? */
+                       if (btrfs_try_remove_subvol(pathname))
+                               INFO("Removed btrfs subvolume at %s\n", pathname);
                        continue;
+               }
                if (S_ISDIR(mystat.st_mode)) {
-                       if (_recursive_rmdir_onedev(pathname, pdev, exclude, level+1) < 0)
+                       if (_recursive_rmdir(pathname, pdev, exclude, level+1, onedev) < 0)
                                failed=1;
                } else {
                        if (unlink(pathname) < 0) {
-                               ERROR("%s: failed to delete %s", __func__, pathname);
+                               SYSERROR("%s: failed to delete %s", __func__, pathname);
                                failed=1;
                        }
                }
        }
 
-       if (rmdir(dirname) < 0) {
-               if (!hadexclude) {
-                       ERROR("%s: failed to delete %s", __func__, dirname);
-                       failed=1;
-               }
+       if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !hadexclude) {
+               ERROR("%s: failed to delete %s", __func__, dirname);
+               failed=1;
        }
 
        ret = closedir(dir);
@@ -137,54 +180,43 @@ static int _recursive_rmdir_onedev(char *dirname, dev_t pdev,
        return failed ? -1 : 0;
 }
 
-/* returns 0 on success, -1 if there were any failures */
-extern int lxc_rmdir_onedev(char *path, const char *exclude)
+/* we have two different magic values for overlayfs, yay */
+#define OVERLAYFS_SUPER_MAGIC 0x794c764f
+#define OVERLAY_SUPER_MAGIC 0x794c7630
+/*
+ * In overlayfs, st_dev is unreliable.  so on overlayfs we don't do
+ * the lxc_rmdir_onedev()
+ */
+static bool is_native_overlayfs(const char *path)
 {
-       struct stat mystat;
-
-       if (lstat(path, &mystat) < 0) {
-               ERROR("%s: failed to stat %s", __func__, path);
-               return -1;
-       }
+       struct statfs sb;
 
-       return _recursive_rmdir_onedev(path, mystat.st_dev, exclude, 0);
+       if (statfs(path, &sb) < 0)
+               return false;
+       if (sb.f_type == OVERLAYFS_SUPER_MAGIC ||
+                       sb.f_type == OVERLAY_SUPER_MAGIC)
+               return true;
+       return false;
 }
 
-static int mount_fs(const char *source, const char *target, const char *type)
+/* returns 0 on success, -1 if there were any failures */
+extern int lxc_rmdir_onedev(char *path, const char *exclude)
 {
-       /* the umount may fail */
-       if (umount(target))
-               WARN("failed to unmount %s : %s", target, strerror(errno));
+       struct stat mystat;
+       bool onedev = true;
 
-       if (mount(source, target, type, 0, NULL)) {
-               ERROR("failed to mount %s : %s", target, strerror(errno));
-               return -1;
+       if (is_native_overlayfs(path)) {
+               onedev = false;
        }
 
-       DEBUG("'%s' mounted on '%s'", source, target);
-
-       return 0;
-}
-
-extern void lxc_setup_fs(void)
-{
-       if (mount_fs("proc", "/proc", "proc"))
-               INFO("failed to remount proc");
-
-       /* if we can't mount /dev/shm, continue anyway */
-       if (mount_fs("shmfs", "/dev/shm", "tmpfs"))
-               INFO("failed to mount /dev/shm");
-
-       /* If we were able to mount /dev/shm, then /dev exists */
-       /* Sure, but it's read-only per config :) */
-       if (access("/dev/mqueue", F_OK) && mkdir("/dev/mqueue", 0666)) {
-               DEBUG("failed to create '/dev/mqueue'");
-               return;
+       if (lstat(path, &mystat) < 0) {
+               if (errno == ENOENT)
+                       return 0;
+               ERROR("%s: failed to stat %s", __func__, path);
+               return -1;
        }
 
-       /* continue even without posix message queue support */
-       if (mount_fs("mqueue", "/dev/mqueue", "mqueue"))
-               INFO("failed to mount /dev/mqueue");
+       return _recursive_rmdir(path, mystat.st_dev, exclude, 0, onedev);
 }
 
 /* borrowed from iproute2 */
@@ -229,195 +261,6 @@ extern int mkdir_p(const char *dir, mode_t mode)
        return 0;
 }
 
-extern void remove_trailing_slashes(char *p)
-{
-       int l = strlen(p);
-       while (--l >= 0 && (p[l] == '/' || p[l] == '\n'))
-               p[l] = '\0';
-}
-
-static char *copy_global_config_value(char *p)
-{
-       int len = strlen(p);
-       char *retbuf;
-
-       if (len < 1)
-               return NULL;
-       if (p[len-1] == '\n') {
-               p[len-1] = '\0';
-               len--;
-       }
-       retbuf = malloc(len+1);
-       if (!retbuf)
-               return NULL;
-       strcpy(retbuf, p);
-       return retbuf;
-}
-
-#define DEFAULT_VG "lxc"
-#define DEFAULT_THIN_POOL "lxc"
-#define DEFAULT_ZFSROOT "lxc"
-
-const char *lxc_global_config_value(const char *option_name)
-{
-       static const char * const options[][2] = {
-               { "lxc.bdev.lvm.vg",        DEFAULT_VG      },
-               { "lxc.bdev.lvm.thin_pool", DEFAULT_THIN_POOL },
-               { "lxc.bdev.zfs.root",      DEFAULT_ZFSROOT },
-               { "lxc.lxcpath",            NULL            },
-               { "lxc.default_config",     NULL            },
-               { "lxc.cgroup.pattern",     NULL            },
-               { "lxc.cgroup.use",         NULL            },
-               { NULL, NULL },
-       };
-
-       /* placed in the thread local storage pool for non-bionic targets */
-#ifdef HAVE_TLS
-       static __thread const char *values[sizeof(options) / sizeof(options[0])] = { 0 };
-#else
-       static const char *values[sizeof(options) / sizeof(options[0])] = { 0 };
-#endif
-
-       /* user_config_path is freed as soon as it is used */
-       char *user_config_path = NULL;
-
-       /*
-        * The following variables are freed at bottom unconditionally.
-        * So NULL the value if it is to be returned to the caller
-        */
-       char *user_default_config_path = NULL;
-       char *user_lxc_path = NULL;
-       char *user_cgroup_pattern = NULL;
-
-       if (geteuid() > 0) {
-               const char *user_home = getenv("HOME");
-               if (!user_home)
-                       user_home = "/";
-
-               user_config_path = malloc(sizeof(char) * (22 + strlen(user_home)));
-               user_default_config_path = malloc(sizeof(char) * (26 + strlen(user_home)));
-               user_lxc_path = malloc(sizeof(char) * (19 + strlen(user_home)));
-
-               sprintf(user_config_path, "%s/.config/lxc/lxc.conf", user_home);
-               sprintf(user_default_config_path, "%s/.config/lxc/default.conf", user_home);
-               sprintf(user_lxc_path, "%s/.local/share/lxc/", user_home);
-               user_cgroup_pattern = strdup("%n");
-       }
-       else {
-               user_config_path = strdup(LXC_GLOBAL_CONF);
-               user_default_config_path = strdup(LXC_DEFAULT_CONFIG);
-               user_lxc_path = strdup(LXCPATH);
-               user_cgroup_pattern = strdup(DEFAULT_CGROUP_PATTERN);
-       }
-
-       const char * const (*ptr)[2];
-       size_t i;
-       char buf[1024], *p, *p2;
-       FILE *fin = NULL;
-
-       for (i = 0, ptr = options; (*ptr)[0]; ptr++, i++) {
-               if (!strcmp(option_name, (*ptr)[0]))
-                       break;
-       }
-       if (!(*ptr)[0]) {
-               free(user_config_path);
-               free(user_default_config_path);
-               free(user_lxc_path);
-               free(user_cgroup_pattern);
-               errno = EINVAL;
-               return NULL;
-       }
-
-       if (values[i]) {
-               free(user_config_path);
-               free(user_default_config_path);
-               free(user_lxc_path);
-               free(user_cgroup_pattern);
-               return values[i];
-       }
-
-       fin = fopen_cloexec(user_config_path, "r");
-       free(user_config_path);
-       if (fin) {
-               while (fgets(buf, 1024, fin)) {
-                       if (buf[0] == '#')
-                               continue;
-                       p = strstr(buf, option_name);
-                       if (!p)
-                               continue;
-                       /* see if there was just white space in front
-                        * of the option name
-                        */
-                       for (p2 = buf; p2 < p; p2++) {
-                               if (*p2 != ' ' && *p2 != '\t')
-                                       break;
-                       }
-                       if (p2 < p)
-                               continue;
-                       p = strchr(p, '=');
-                       if (!p)
-                               continue;
-                       /* see if there was just white space after
-                        * the option name
-                        */
-                       for (p2 += strlen(option_name); p2 < p; p2++) {
-                               if (*p2 != ' ' && *p2 != '\t')
-                                       break;
-                       }
-                       if (p2 < p)
-                               continue;
-                       p++;
-                       while (*p && (*p == ' ' || *p == '\t')) p++;
-                       if (!*p)
-                               continue;
-
-                       if (strcmp(option_name, "lxc.lxcpath") == 0) {
-                               free(user_lxc_path);
-                               user_lxc_path = copy_global_config_value(p);
-                               remove_trailing_slashes(user_lxc_path);
-                               values[i] = user_lxc_path;
-                               user_lxc_path = NULL;
-                               goto out;
-                       }
-
-                       values[i] = copy_global_config_value(p);
-                       goto out;
-               }
-       }
-       /* could not find value, use default */
-       if (strcmp(option_name, "lxc.lxcpath") == 0) {
-               remove_trailing_slashes(user_lxc_path);
-               values[i] = user_lxc_path;
-               user_lxc_path = NULL;
-       }
-       else if (strcmp(option_name, "lxc.default_config") == 0) {
-               values[i] = user_default_config_path;
-               user_default_config_path = NULL;
-       }
-       else if (strcmp(option_name, "lxc.cgroup.pattern") == 0) {
-               values[i] = user_cgroup_pattern;
-               user_cgroup_pattern = NULL;
-       }
-       else
-               values[i] = (*ptr)[1];
-
-       /* special case: if default value is NULL,
-        * and there is no config, don't view that
-        * as an error... */
-       if (!values[i])
-               errno = 0;
-
-out:
-       if (fin)
-               fclose(fin);
-
-       free(user_cgroup_pattern);
-       free(user_default_config_path);
-       free(user_lxc_path);
-
-       return values[i];
-}
-
 char *get_rundir()
 {
        char *rundir;
@@ -623,50 +466,6 @@ const char** lxc_va_arg_list_to_argv_const(va_list ap, size_t skip)
        return (const char**)lxc_va_arg_list_to_argv(ap, skip, 0);
 }
 
-FILE *fopen_cloexec(const char *path, const char *mode)
-{
-       int open_mode = 0;
-       int step = 0;
-       int fd;
-       int saved_errno = 0;
-       FILE *ret;
-
-       if (!strncmp(mode, "r+", 2)) {
-               open_mode = O_RDWR;
-               step = 2;
-       } else if (!strncmp(mode, "r", 1)) {
-               open_mode = O_RDONLY;
-               step = 1;
-       } else if (!strncmp(mode, "w+", 2)) {
-               open_mode = O_RDWR | O_TRUNC | O_CREAT;
-               step = 2;
-       } else if (!strncmp(mode, "w", 1)) {
-               open_mode = O_WRONLY | O_TRUNC | O_CREAT;
-               step = 1;
-       } else if (!strncmp(mode, "a+", 2)) {
-               open_mode = O_RDWR | O_CREAT | O_APPEND;
-               step = 2;
-       } else if (!strncmp(mode, "a", 1)) {
-               open_mode = O_WRONLY | O_CREAT | O_APPEND;
-               step = 1;
-       }
-       for (; mode[step]; step++)
-               if (mode[step] == 'x')
-                       open_mode |= O_EXCL;
-       open_mode |= O_CLOEXEC;
-
-       fd = open(path, open_mode, 0666);
-       if (fd < 0)
-               return NULL;
-
-       ret = fdopen(fd, mode);
-       saved_errno = errno;
-       if (!ret)
-               close(fd);
-       errno = saved_errno;
-       return ret;
-}
-
 extern struct lxc_popen_FILE *lxc_popen(const char *command)
 {
        struct lxc_popen_FILE *fp = NULL;
@@ -917,6 +716,24 @@ char **lxc_normalize_path(const char *path)
        return components;
 }
 
+/*
+ * Rewrite @path in place from the components returned by
+ * lxc_normalize_path(), joined with single '/' separators.  A leading '/'
+ * is kept when @path started with one.
+ *
+ * The result is written back into the caller's buffer, so it must not be
+ * longer than the original string; if it would be, @path is left untouched
+ * and false is returned instead of silently truncating.
+ *
+ * Returns true on success, false on allocation failure or if the rebuilt
+ * path does not fit.
+ */
+bool lxc_deslashify(char *path)
+{
+       char **parts;
+       char *joined;
+       size_t oldlen, newlen;
+
+       parts = lxc_normalize_path(path);
+       if (!parts)
+               return false;
+
+       joined = lxc_string_join("/", (const char **) parts, *path == '/');
+       lxc_free_array((void **) parts, free);
+       if (!joined)
+               return false;
+
+       oldlen = strlen(path);
+       newlen = strlen(joined);
+       if (newlen > oldlen) {
+               free(joined);
+               return false;
+       }
+
+       /* Copy the terminating NUL as well so the result is always a
+        * well-formed string, whatever its length. */
+       memcpy(path, joined, newlen + 1);
+       free(joined);
+       return true;
+}
+
 char *lxc_append_paths(const char *first, const char *second)
 {
        size_t len = strlen(first) + strlen(second) + 1;
@@ -1142,7 +959,7 @@ void **lxc_append_null_to_array(void **array, size_t count)
        if (count) {
                temp = realloc(array, (count + 1) * sizeof(*array));
                if (!temp) {
-                       int i;
+                       size_t i;
                        for (i = 0; i < count; i++)
                                free(array[i]);
                        free(array);
@@ -1386,6 +1203,11 @@ bool file_exists(const char *f)
        return stat(f, &statbuf) == 0;
 }
 
+/* Report whether /proc/self/ns/cgroup exists, as seen by file_exists(). */
+bool cgns_supported(void)
+{
+       static const char cgroup_ns_path[] = "/proc/self/ns/cgroup";
+
+       return file_exists(cgroup_ns_path);
+}
+
 /* historically lxc-init has been under /usr/lib/lxc and under
  * /usr/lib/$ARCH/lxc.  It now lives as $prefix/sbin/init.lxc.
  */
@@ -1506,3 +1328,569 @@ int is_dir(const char *path)
                return 1;
        return 0;
 }
+
+/*
+ * Resolve the '-t' template argument of lxc-create to an executable path.
+ *
+ * An absolute path that is executable is used as-is (duplicated).  Anything
+ * else, e.g. 'sshd', is looked up as LXCTEMPLATEDIR/lxc-<t> and must be
+ * executable.
+ *
+ * Returns a malloc()ed string the caller must free, or NULL on error.
+ */
+char *get_template_path(const char *t)
+{
+       char *tpath;
+       int len, ret;
+
+       /* An absolute, executable path is taken verbatim. */
+       if (t[0] == '/' && access(t, X_OK) == 0)
+               return strdup(t);
+
+       len = strlen(LXCTEMPLATEDIR) + strlen("/lxc-") + strlen(t) + 1;
+       tpath = malloc(len);
+       if (!tpath)
+               return NULL;
+
+       ret = snprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
+       if (ret < 0 || ret >= len)
+               goto err;
+
+       if (access(tpath, X_OK) < 0) {
+               SYSERROR("bad template: %s", t);
+               goto err;
+       }
+
+       return tpath;
+
+err:
+       free(tpath);
+       return NULL;
+}
+
+/*
+ * Sets the process title to the specified title. Note:
+ *   1. this function requires root to succeed
+ *   2. it clears /proc/self/environ
+ *   3. it may not succeed (e.g. if title is longer than /proc/self/environ +
+ *      the original title)
+ *
+ * The current memory layout is read from /proc/self/stat and handed back to
+ * the kernel through prctl(PR_SET_MM, PR_SET_MM_MAP) with only the argument
+ * area changed.
+ *
+ * Returns 0 on success, -1 if /proc/self/stat could not be read or parsed or
+ * memory could not be allocated, otherwise the return value of prctl().
+ */
+int setproctitle(char *title)
+{
+       static char *proctitle = NULL;
+       char buf[2048], *tmp;
+       FILE *f;
+       int i, ret = 0;
+       size_t len;
+
+       /* We don't really need to know all of this stuff, but unfortunately
+        * PR_SET_MM_MAP requires us to set it all at once, so we have to
+        * figure it out anyway.
+        */
+       unsigned long start_data, end_data, start_brk, start_code, end_code,
+                       start_stack, arg_start, arg_end, env_start, env_end,
+                       brk_val;
+       struct prctl_mm_map prctl_map;
+
+       f = fopen_cloexec("/proc/self/stat", "r");
+       if (!f)
+               return -1;
+
+       tmp = fgets(buf, sizeof(buf), f);
+       fclose(f);
+       if (!tmp)
+               return -1;
+
+       /* Field 2 (comm) is wrapped in parentheses and may itself contain
+        * spaces, so fields cannot be counted from the start of the line.
+        * Start from the last ')' instead: the 24th space after it is the
+        * one in front of field 26.  Columns 26-28 are start_code, end_code
+        * and start_stack. */
+       tmp = strrchr(buf, ')');
+       for (i = 0; i < 24; i++) {
+               if (!tmp)
+                       return -1;
+               tmp = strchr(tmp + 1, ' ');
+       }
+       if (!tmp)
+               return -1;
+
+       i = sscanf(tmp, "%lu %lu %lu", &start_code, &end_code, &start_stack);
+       if (i != 3)
+               return -1;
+
+       /* Skip the next 19 fields, column 45-51 are start_data to env_end */
+       for (i = 0; i < 19; i++) {
+               if (!tmp)
+                       return -1;
+               tmp = strchr(tmp + 1, ' ');
+       }
+       if (!tmp)
+               return -1;
+
+       i = sscanf(tmp, "%lu %lu %lu %lu %lu %lu %lu",
+               &start_data,
+               &end_data,
+               &start_brk,
+               &arg_start,
+               &arg_end,
+               &env_start,
+               &env_end);
+       if (i != 7)
+               return -1;
+
+       /* Include the null byte here, because in the calculations below we
+        * want to have room for it. */
+       len = strlen(title) + 1;
+
+       /* If we don't have enough room by just overwriting the old proctitle,
+        * let's allocate a new one.  The buffer is kept in a static so that
+        * later calls can reuse or grow it.
+        */
+       if (len > arg_end - arg_start) {
+               void *m;
+
+               m = realloc(proctitle, len);
+               if (!m)
+                       return -1;
+               proctitle = m;
+
+               arg_start = (unsigned long) proctitle;
+       }
+
+       arg_end = arg_start + len;
+
+       brk_val = syscall(__NR_brk, 0);
+
+       prctl_map = (struct prctl_mm_map) {
+               .start_code = start_code,
+               .end_code = end_code,
+               .start_stack = start_stack,
+               .start_data = start_data,
+               .end_data = end_data,
+               .start_brk = start_brk,
+               .brk = brk_val,
+               .arg_start = arg_start,
+               .arg_end = arg_end,
+               .env_start = env_start,
+               .env_end = env_end,
+               .auxv = NULL,
+               .auxv_size = 0,
+               .exe_fd = -1,
+       };
+
+       /* Only write the new title once the kernel has accepted the map. */
+       ret = prctl(PR_SET_MM, PR_SET_MM_MAP, (long) &prctl_map, sizeof(prctl_map), 0);
+       if (ret == 0)
+               strcpy((char*)arg_start, title);
+       else
+               INFO("setting cmdline failed - %s", strerror(errno));
+
+       return ret;
+}
+
+/*
+ * Step to the next segment of a tokenized path.
+ *
+ * @path:    a pathname in which every '/' has been replaced with '\0'.
+ * @offsetp: in: offset inside (or just before) the segment last seen;
+ *           out: offset of the next segment, or >= @fulllen when done.
+ * @fulllen: length of the original, untokenized path.
+ *
+ * Returns a pointer to the next path segment, or NULL if there is none.
+ */
+static char *get_nextpath(char *path, int *offsetp, int fulllen)
+{
+       int pos = *offsetp;
+
+       if (pos >= fulllen)
+               return NULL;
+
+       /* Run to the end of the segment we are currently in ... */
+       for (; pos < fulllen && path[pos] != '\0'; pos++)
+               ;
+       /* ... then across the separator(s) that follow it. */
+       for (; pos < fulllen && path[pos] == '\0'; pos++)
+               ;
+
+       *offsetp = pos;
+       if (pos >= fulllen)
+               return NULL;
+       return &path[pos];
+}
+
+/*
+ * Check that @subdir is @dir itself or lies beneath it.  @len is the length
+ * of @dir (to avoid having to recalculate it).
+ *
+ * The match is made on whole path components: "/ab" is not a subdir of "/a".
+ * A @len of 0 is tolerated; only the component-boundary test is applied in
+ * that case, without looking at dir[len-1].
+ */
+static bool is_subdir(const char *subdir, const char *dir, size_t len)
+{
+       size_t subdirlen = strlen(subdir);
+
+       if (subdirlen < len)
+               return false;
+       if (strncmp(subdir, dir, len) != 0)
+               return false;
+       /* @dir ends in '/', so the prefix match already ended on a
+        * component boundary.  Guard len == 0 to avoid reading dir[-1]. */
+       if (len > 0 && dir[len - 1] == '/')
+               return true;
+       if (subdir[len] == '/' || subdirlen == len)
+               return true;
+       return false;
+}
+
+/*
+ * Tell whether the open file descriptor @fd refers to a symbolic link.
+ *
+ * Returns 0 if it does not, -ELOOP if it does, and -ENOENT if fstat()
+ * failed on @fd.
+ */
+static int check_symlink(int fd)
+{
+       struct stat st;
+
+       if (fstat(fd, &st) < 0)
+               return -ENOENT;
+
+       return S_ISLNK(st.st_mode) ? -ELOOP : 0;
+}
+
+/*
+ * Open a file or directory, provided that it is not a symlink.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ *
+ * @dirfd:    directory fd that @nextpath is resolved against.
+ * @nextpath: a single path component to open.
+ *
+ * Returns the new fd on success.  On failure a negative value is returned
+ * and errno describes the error; in particular errno is ELOOP whenever
+ * @nextpath turned out to be a symbolic link.
+ */
+static int open_if_safe(int dirfd, const char *nextpath)
+{
+       int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
+       if (newfd >= 0) /* was not a symlink, all good */
+               return newfd;
+
+       if (errno == ELOOP)
+               return newfd;
+
+       if (errno == EPERM || errno == EACCES) {
+               /* we are not allowed to read it, so try opening with
+                * O_PATH instead */
+               newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
+               if (newfd >= 0) {
+                       /* O_PATH will return an fd for symlinks.  We know
+                        * nextpath wasn't a symlink at last openat, so if fd
+                        * is now a link, then something fishy is going on
+                        */
+                       int ret = check_symlink(newfd);
+                       if (ret < 0) {
+                               close(newfd);
+                               /* Callers look at errno to tell a symlink
+                                * from other failures; set it after close()
+                                * so it cannot be overwritten. */
+                               errno = -ret;
+                               newfd = ret;
+                       }
+               }
+       }
+
+       return newfd;
+}
+
+/*
+ * Open a path intending for mounting, ensuring that the final path
+ * is inside the container's rootfs.
+ *
+ * The path is walked one component at a time with open_if_safe(), so the
+ * walk stops as soon as any component below @prefix_skip is a symlink.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ *
+ * @target: path to be opened
+ * @prefix_skip: a part of @target in which to ignore symbolic links.  This
+ * would be the container's rootfs.  NULL or "" means "/".
+ *
+ * Return an open fd for the path, or <0 on error.
+ *
+ * NOTE(review): for a @target that does not begin with '/' (and no
+ * @prefix_skip), the first get_nextpath() call looks like it steps over the
+ * leading component, and the walk starts from "/" rather than the current
+ * directory - confirm whether relative targets are really supported.
+ */
+static int open_without_symlink(const char *target, const char *prefix_skip)
+{
+       int curlen = 0, dirfd, fulllen, i;
+       char *dup = NULL;
+
+       fulllen = strlen(target);
+
+       /* make sure prefix-skip makes sense */
+       if (prefix_skip && strlen(prefix_skip) > 0) {
+               curlen = strlen(prefix_skip);
+               if (!is_subdir(target, prefix_skip, curlen)) {
+                       ERROR("WHOA there - target '%s' didn't start with prefix '%s'",
+                               target, prefix_skip);
+                       return -EINVAL;
+               }
+               /*
+                * get_nextpath() expects the curlen argument to be
+                * on a  (turned into \0) / or before it, so decrement
+                * curlen to make sure that happens
+                */
+               if (curlen)
+                       curlen--;
+       } else {
+               prefix_skip = "/";
+               curlen = 0;
+       }
+
+       /* Make a copy of target which we can hack up, and tokenize it */
+       if ((dup = strdup(target)) == NULL) {
+               SYSERROR("Out of memory checking for symbolic link");
+               return -ENOMEM;
+       }
+       for (i = 0; i < fulllen; i++) {
+               if (dup[i] == '/')
+                       dup[i] = '\0';
+       }
+
+       /* The prefix itself is opened normally, i.e. symlinks in it are
+        * followed. */
+       dirfd = open(prefix_skip, O_RDONLY);
+       if (dirfd < 0)
+               goto out;
+       while (1) {
+               int newfd, saved_errno;
+               char *nextpath;
+
+               /* No component left: dirfd now refers to the full target. */
+               if ((nextpath = get_nextpath(dup, &curlen, fulllen)) == NULL)
+                       goto out;
+               newfd = open_if_safe(dirfd, nextpath);
+               /* close() may change errno, keep the one from the open */
+               saved_errno = errno;
+               close(dirfd);
+               dirfd = newfd;
+               if (newfd < 0) {
+                       errno = saved_errno;
+                       if (errno == ELOOP)
+                               SYSERROR("%s in %s was a symbolic link!", nextpath, target);
+                       goto out;
+               }
+       }
+
+out:
+       free(dup);
+       return dirfd;
+}
+
+/*
+ * Safely mount a path into a container, ensuring that the mount target
+ * is under the container's @rootfs.  (If @rootfs is NULL, then the container
+ * uses the host's /)
+ *
+ * The destination (and, for bind mounts with a relative source, the source)
+ * is opened with open_without_symlink() and the mount is then performed on
+ * the matching /proc/self/fd/<fd> paths.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ *
+ * @src, @dest, @fstype, @flags, @data: as for mount(2).
+ * @rootfs: prefix of @dest in which symlinks are tolerated, or NULL.
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+int safe_mount(const char *src, const char *dest, const char *fstype,
+               unsigned long flags, const void *data, const char *rootfs)
+{
+       int srcfd = -1, destfd, ret, saved_errno;
+       char srcbuf[50], destbuf[50]; /* only needs enough for /proc/self/fd/<fd> */
+       const char *mntsrc = src;
+
+       if (!rootfs)
+               rootfs = "";
+
+       /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
+       if ((flags & MS_BIND) && src && src[0] != '/') {
+               INFO("this is a relative bind mount");
+               srcfd = open_without_symlink(src, NULL);
+               if (srcfd < 0)
+                       return srcfd;
+               ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
+               /* ret == sizeof(srcbuf) already means truncation */
+               if (ret < 0 || (size_t)ret >= sizeof(srcbuf)) {
+                       close(srcfd);
+                       ERROR("Out of memory");
+                       return -EINVAL;
+               }
+               mntsrc = srcbuf;
+       }
+
+       destfd = open_without_symlink(dest, rootfs);
+       if (destfd < 0) {
+               if (srcfd != -1) {
+                       /* keep the errno of the failed open for the caller */
+                       saved_errno = errno;
+                       close(srcfd);
+                       errno = saved_errno;
+               }
+               return destfd;
+       }
+
+       ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
+       if (ret < 0 || (size_t)ret >= sizeof(destbuf)) {
+               if (srcfd != -1)
+                       close(srcfd);
+               close(destfd);
+               ERROR("Out of memory");
+               return -EINVAL;
+       }
+
+       ret = mount(mntsrc, destbuf, fstype, flags, data);
+       saved_errno = errno;
+       if (srcfd != -1)
+               close(srcfd);
+       close(destfd);
+       if (ret < 0) {
+               errno = saved_errno;
+               SYSERROR("Failed to mount %s onto %s", src, dest);
+               return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * Mount a proc under @rootfs if proc self points to a pid other than
+ * my own.  This is needed to have a known-good proc mount for setting
+ * up LSMs both at container startup and attach.
+ *
+ * @rootfs : the rootfs where proc should be mounted; "" selects the
+ *           host's own /proc, which is then mounted with plain mount()
+ *           instead of safe_mount().
+ *
+ * Returns < 0 on failure, 0 if the correct proc was already mounted
+ * and 1 if a new proc was mounted.
+ *
+ * NOTE: not to be called from inside the container namespace!
+ */
+int mount_proc_if_needed(const char *rootfs)
+{
+       char path[MAXPATHLEN];
+       char link[20];
+       int linklen, ret;
+       int mypid;
+
+       ret = snprintf(path, MAXPATHLEN, "%s/proc/self", rootfs);
+       if (ret < 0 || ret >= MAXPATHLEN) {
+               SYSERROR("proc path name too long");
+               return -1;
+       }
+       /* readlink() does not NUL-terminate: zero the buffer and leave the
+        * last byte alone so that link is always a valid string. */
+       memset(link, 0, sizeof(link));
+       linklen = readlink(path, link, sizeof(link) - 1);
+       mypid = (int)getpid();
+       INFO("I am %d, /proc/self points to '%s'", mypid, link);
+       ret = snprintf(path, MAXPATHLEN, "%s/proc", rootfs);
+       if (ret < 0 || ret >= MAXPATHLEN) {
+               SYSERROR("proc path name too long");
+               return -1;
+       }
+       if (linklen < 0) /* /proc not mounted */
+               goto domount;
+       if (atoi(link) != mypid) {
+               /* wrong /procs mounted */
+               umount2(path, MNT_DETACH); /* ignore failure */
+               goto domount;
+       }
+       /* the right proc is already mounted */
+       return 0;
+
+domount:
+       if (!strcmp(rootfs,"")) /* no container rootfs given */
+               ret = mount("proc", path, "proc", 0, NULL);
+       else
+               ret = safe_mount("proc", path, "proc", 0, NULL, rootfs);
+
+       if (ret < 0)
+               return -1;
+
+       INFO("Mounted /proc in container for security transition");
+       return 1;
+}
+
+/*
+ * Open /dev/null for reading and writing.
+ *
+ * Returns the new fd, or the negative result of open() after logging the
+ * failure.
+ */
+int open_devnull(void)
+{
+       int fd;
+
+       fd = open("/dev/null", O_RDWR);
+       if (fd >= 0)
+               return fd;
+
+       SYSERROR("Can't open /dev/null");
+       return fd;
+}
+
+/*
+ * Make stdin, stdout and stderr (fds 0, 1 and 2) duplicates of @fd.
+ *
+ * Returns 0 on success, -1 if @fd is negative or any dup2() fails.  @fd
+ * itself is left open.
+ */
+int set_stdfds(int fd)
+{
+       int target;
+
+       if (fd < 0)
+               return -1;
+
+       for (target = 0; target <= 2; target++) {
+               if (dup2(fd, target) < 0)
+                       return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Point stdin, stdout and stderr at /dev/null.
+ *
+ * Returns the result of set_stdfds(), or -1 if /dev/null could not be
+ * opened.
+ */
+int null_stdfds(void)
+{
+       int fd, ret;
+
+       fd = open_devnull();
+       if (fd < 0)
+               return -1;
+
+       ret = set_stdfds(fd);
+       close(fd);
+       return ret;
+}
+
+/*
+ * Count the lines of file @fn, as delivered by getline().
+ *
+ * Returns the line count, or -1 if the file could not be opened.
+ */
+int lxc_count_file_lines(const char *fn)
+{
+       FILE *f;
+       char *line = NULL;
+       size_t sz = 0;
+       int count;
+
+       f = fopen_cloexec(fn, "r");
+       if (!f)
+               return -1;
+
+       for (count = 0; getline(&line, &sz, f) != -1; count++)
+               ;
+
+       free(line);
+       fclose(f);
+       return count;
+}
+
+/*
+ * mmap() wrapper that leaves one extra zero byte directly behind the
+ * mapping, so that the mapped data can be handed to normal string-handling
+ * functions.
+ *
+ * Arguments are those of mmap().  Returns the mapping, or MAP_FAILED on
+ * error.  Release it with lxc_strmunmap().
+ */
+void *lxc_strmmap(void *addr, size_t length, int prot, int flags, int fd,
+                 off_t offset)
+{
+       void *base, *mapped;
+
+       /* Reserve an anonymous, zero-filled region that is one byte longer
+        * than what the caller asked for. */
+       base = mmap(addr, length + 1, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       if (base == MAP_FAILED)
+               return MAP_FAILED;
+
+       /* Map the requested object over the start of that region at a fixed
+        * address; the extra byte behind it stays zero. */
+       mapped = mmap(base, length, prot, MAP_FIXED | flags, fd, offset);
+       if (mapped != MAP_FAILED)
+               return mapped;
+
+       munmap(base, length + 1);
+       return MAP_FAILED;
+}
+
+/*
+ * Counterpart of lxc_strmmap(): unmap @length bytes at @addr plus the one
+ * extra trailing byte.  Returns the result of munmap().
+ */
+int lxc_strmunmap(void *addr, size_t length)
+{
+       const size_t mapped_len = length + 1;
+
+       return munmap(addr, mapped_len);
+}
+
+/*
+ * Check whether a signal is blocked by a process.
+ *
+ * @pid:    process to inspect; its /proc/<pid>/status is read.
+ * @signal: signal number (1-based, as in <signal.h>).
+ *
+ * The "SigBlk:" line holds a hexadecimal bitmask in which bit (n - 1)
+ * stands for signal n.
+ *
+ * Returns true if @signal is blocked.  Returns false if it is not, if
+ * @signal is out of range, or if the status file could not be read or
+ * parsed.
+ */
+bool task_blocking_signal(pid_t pid, int signal)
+{
+       bool bret = false;
+       char *line = NULL;
+       unsigned long long sigblk = 0;
+       size_t n = 0;
+       int ret;
+       FILE *f;
+       /* A pid printed in decimal needs at most 21 characters here. */
+       char status[/* /proc */ 5 + /* /pid-to-str */ 21 + /* /status */ 7 + /* \0 */ 1];
+
+       /* Keep the shift below within the width of the mask. */
+       if (signal < 1 || (size_t)signal > sizeof(sigblk) * 8)
+               return bret;
+
+       ret = snprintf(status, sizeof(status), "/proc/%d/status", pid);
+       if (ret < 0 || (size_t)ret >= sizeof(status))
+               return bret;
+
+       f = fopen(status, "r");
+       if (!f)
+               return bret;
+
+       while (getline(&line, &n, f) != -1) {
+               if (strncmp(line, "SigBlk:\t", 8))
+                       continue;
+               if (sscanf(line + 8, "%llx", &sigblk) != 1)
+                       goto out;
+               break;
+       }
+
+       /* Test the bit that belongs to @signal, not the signal number
+        * itself. */
+       if (sigblk & (1ULL << (signal - 1)))
+               bret = true;
+
+out:
+       free(line);
+       fclose(f);
+       return bret;
+}