utils: add lxc_deslashify

[mirror_lxc.git] / src / lxc / utils.c
diff --git a/src/lxc/utils.c b/src/lxc/utils.c

index 7ced314872ac4bbca21d9e564f6188d95163db68..9a6ef4b37528f1cb9460164db8d380b4decff15a 100644 (file)
--- a/src/lxc/utils.c
+++ b/src/lxc/utils.c
@@ -23,48 +23,61 @@
  
  #include "config.h"
  
+#include <assert.h>
+#include <dirent.h>
  #include <errno.h>
-#include <unistd.h>
-#include <stdlib.h>
+#include <fcntl.h>
+#include <libgen.h>
  #include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
  #include <string.h>
-#include <sys/types.h>
-#include <sys/vfs.h>
-#include <sys/stat.h>
+#include <unistd.h>
  #include <sys/mman.h>
-#include <sys/param.h>
  #include <sys/mount.h>
-#include <dirent.h>
-#include <fcntl.h>
-#include <libgen.h>
+#include <sys/param.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
  #include <sys/types.h>
+#include <sys/vfs.h>
  #include <sys/wait.h>
-#include <assert.h>
-#include <sys/prctl.h>
  
-#include "utils.h"
  #include "log.h"
  #include "lxclock.h"
  #include "namespace.h"
+#include "utils.h"
  
  #ifndef PR_SET_MM
  #define PR_SET_MM 35
  #endif
  
-#ifndef PR_SET_MM_ARG_START
-#define PR_SET_MM_ARG_START 8
-#endif
-
-#ifndef PR_SET_MM_ARG_END
-#define PR_SET_MM_ARG_END 9
+#ifndef PR_SET_MM_MAP
+#define PR_SET_MM_MAP 14
+
+/*
+ * Fallback definition of the argument structure handed to
+ * prctl(PR_SET_MM, PR_SET_MM_MAP, ...).  It is only compiled when the
+ * system headers do not already define PR_SET_MM_MAP.
+ *
+ * NOTE(review): presumably this has to match the kernel's own layout of
+ * struct prctl_mm_map field for field - confirm against <linux/prctl.h>
+ * before reordering or resizing anything here.
+ */
+struct prctl_mm_map {
+        uint64_t   start_code;
+        uint64_t   end_code;
+        uint64_t   start_data;
+        uint64_t   end_data;
+        uint64_t   start_brk;
+        uint64_t   brk;
+        uint64_t   start_stack;
+        uint64_t   arg_start;
+        uint64_t   arg_end;
+        uint64_t   env_start;
+        uint64_t   env_end;
+        uint64_t   *auxv;
+        uint32_t   auxv_size;
+        uint32_t   exe_fd;
+};
  #endif
  
-#ifndef PR_SET_MM_ENV_START
-#define PR_SET_MM_ENV_START 10
+#ifndef O_PATH
+#define O_PATH      010000000
  #endif
  
-#ifndef PR_SET_MM_ENV_END
-#define PR_SET_MM_ENV_END 11
+#ifndef O_NOFOLLOW
+#define O_NOFOLLOW  00400000
  #endif
  
  lxc_log_define(lxc_utils, lxc);
@@ -77,7 +90,7 @@ extern bool btrfs_try_remove_subvol(const char *path);
  static int _recursive_rmdir(char *dirname, dev_t pdev,
                             const char *exclude, int level, bool onedev)
  {
-       struct dirent dirent, *direntp;
+       struct dirent *direntp;
         DIR *dir;
         int ret, failed=0;
         char pathname[MAXPATHLEN];
@@ -89,7 +102,7 @@ static int _recursive_rmdir(char *dirname, dev_t pdev,
                 return -1;
         }
  
-       while (!readdir_r(dir, &dirent, &direntp)) {
+       while ((direntp = readdir(dir))) {
                 struct stat mystat;
                 int rc;
  
@@ -197,6 +210,8 @@ extern int lxc_rmdir_onedev(char *path, const char *exclude)
         }
  
         if (lstat(path, &mystat) < 0) {
+               if (errno == ENOENT)
+                       return 0;
                 ERROR("%s: failed to stat %s", __func__, path);
                 return -1;
         }
@@ -701,6 +716,24 @@ char **lxc_normalize_path(const char *path)
         return components;
  }
  
+/*
+ * Rewrite @path in place with its normalized form.
+ *
+ * The path is split into components by lxc_normalize_path() and the
+ * components are joined again with "/" by lxc_string_join(); a leading
+ * separator is requested when @path starts with '/'.
+ * NOTE(review): the exact normalization rules live in lxc_normalize_path()
+ * and are not restated here.
+ *
+ * @path: NUL-terminated, writable path; overwritten on success.
+ *
+ * Returns true on success.  Returns false, leaving @path untouched, if
+ * either helper fails or if the joined result would not fit into the
+ * space occupied by the original string.
+ */
+bool lxc_deslashify(char *path)
+{
+       char **parts, *joined;
+       size_t oldlen, newlen;
+
+       parts = lxc_normalize_path(path);
+       if (!parts)
+               return false;
+
+       joined = lxc_string_join("/", (const char **) parts, *path == '/');
+       lxc_free_array((void **) parts, free);
+       if (!joined)
+               return false;
+
+       oldlen = strlen(path);
+       newlen = strlen(joined);
+       if (newlen > oldlen) {
+               /* never write past the caller's buffer */
+               free(joined);
+               return false;
+       }
+
+       /* copy the terminator too instead of relying on the old one */
+       memcpy(path, joined, newlen + 1);
+       free(joined);
+       return true;
+}
+
  char *lxc_append_paths(const char *first, const char *second)
  {
         size_t len = strlen(first) + strlen(second) + 1;
@@ -926,7 +959,7 @@ void **lxc_append_null_to_array(void **array, size_t count)
         if (count) {
                 temp = realloc(array, (count + 1) * sizeof(*array));
                 if (!temp) {
-                       int i;
+                       size_t i;
                         for (i = 0; i < count; i++)
                                 free(array[i]);
                         free(array);
@@ -1170,6 +1203,11 @@ bool file_exists(const char *f)
         return stat(f, &statbuf) == 0;
  }
  
+/*
+ * Report whether /proc/self/ns/cgroup exists, as determined by
+ * file_exists().
+ * NOTE(review): presumably the presence of this file is taken to mean the
+ * running kernel supports cgroup namespaces - confirm with callers.
+ */
+bool cgns_supported(void)
+{
+       return file_exists("/proc/self/ns/cgroup");
+}
+
  /* historically lxc-init has been under /usr/lib/lxc and under
   * /usr/lib/$ARCH/lxc.  It now lives as $prefix/sbin/init.lxc.
   */
@@ -1334,10 +1372,19 @@ char *get_template_path(const char *t)
   */
  int setproctitle(char *title)
  {
+       static char *proctitle = NULL;
         char buf[2048], *tmp;
         FILE *f;
         int i, len, ret = 0;
-       unsigned long arg_start, arg_end, env_start, env_end;
+
+       /* We don't really need to know all of this stuff, but unfortunately
+        * PR_SET_MM_MAP requires us to set it all at once, so we have to
+        * figure it out anyway.
+        */
+       unsigned long start_data, end_data, start_brk, start_code, end_code,
+                       start_stack, arg_start, arg_end, env_start, env_end,
+                       brk_val;
+       struct prctl_mm_map prctl_map;
  
         f = fopen_cloexec("/proc/self/stat", "r");
         if (!f) {
@@ -1350,56 +1397,321 @@ int setproctitle(char *title)
                 return -1;
         }
  
-       /* Skip the first 47 fields, column 48-51 are ARG_START and
-        * ARG_END. */
+       /* Skip the first 25 fields, column 26-28 are start_code, end_code,
+        * and start_stack */
         tmp = strchr(buf, ' ');
-       for (i = 0; i < 46; i++) {
+       for (i = 0; i < 24; i++) {
                 if (!tmp)
                         return -1;
                 tmp = strchr(tmp+1, ' ');
         }
-
         if (!tmp)
                 return -1;
  
-       i = sscanf(tmp, "%lu %lu %lu %lu", &arg_start, &arg_end, &env_start, &env_end);
-       if (i != 4) {
+       i = sscanf(tmp, "%lu %lu %lu", &start_code, &end_code, &start_stack);
+       if (i != 3)
                 return -1;
+
+       /* Skip the next 19 fields, column 45-51 are start_data to arg_end */
+       for (i = 0; i < 19; i++) {
+               if (!tmp)
+                       return -1;
+               tmp = strchr(tmp+1, ' ');
         }
  
+       if (!tmp)
+               return -1;
+
+       i = sscanf(tmp, "%lu %lu %lu %lu %lu %lu %lu",
+               &start_data,
+               &end_data,
+               &start_brk,
+               &arg_start,
+               &arg_end,
+               &env_start,
+               &env_end);
+       if (i != 7)
+               return -1;
+
         /* Include the null byte here, because in the calculations below we
          * want to have room for it. */
         len = strlen(title) + 1;
  
-       /* We're truncating the environment, so we should use at most the
-        * length of the argument + environment for the title. */
-       if (len > env_end - arg_start) {
-               arg_end = env_end;
-               len = env_end - arg_start;
+       /* If we don't have enough room by just overwriting the old proctitle,
+        * let's allocate a new one.
+        */
+       if (len > arg_end - arg_start) {
+               void *m;
+               m = realloc(proctitle, len);
+               if (!m)
+                       return -1;
+               proctitle = m;
+
+               arg_start = (unsigned long) proctitle;
+       }
+
+       arg_end = arg_start + len;
+
+       brk_val = syscall(__NR_brk, 0);
+
+       prctl_map = (struct prctl_mm_map) {
+               .start_code = start_code,
+               .end_code = end_code,
+               .start_stack = start_stack,
+               .start_data = start_data,
+               .end_data = end_data,
+               .start_brk = start_brk,
+               .brk = brk_val,
+               .arg_start = arg_start,
+               .arg_end = arg_end,
+               .env_start = env_start,
+               .env_end = env_end,
+               .auxv = NULL,
+               .auxv_size = 0,
+               .exe_fd = -1,
+       };
+
+       ret = prctl(PR_SET_MM, PR_SET_MM_MAP, (long) &prctl_map, sizeof(prctl_map), 0);
+       if (ret == 0)
+               strcpy((char*)arg_start, title);
+       else
+               INFO("setting cmdline failed - %s", strerror(errno));
+
+       return ret;
+}
+
+/*
+ * Step to the next segment of a path whose '/' separators have been
+ * overwritten with '\0'.
+ *
+ * @path:    a pathname where / replaced with '\0'.
+ * @offsetp: pointer to int showing which path segment was last seen.
+ *           Updated on return to reflect the next segment.
+ * @fulllen: full original path length.
+ *
+ * Returns a pointer to the next path segment, or NULL if done.
+ */
+static char *get_nextpath(char *path, int *offsetp, int fulllen)
+{
+       int offset = *offsetp;
+
+       if (offset >= fulllen)
+               return NULL;
+
+       /*
+        * Test the bound before dereferencing so that path[fulllen] is
+        * never read.
+        */
+
+       /* skip what is left of the current segment ... */
+       while (offset < fulllen && path[offset] != '\0')
+               offset++;
+       /* ... and then the run of (former) separators following it */
+       while (offset < fulllen && path[offset] == '\0')
+               offset++;
+
+       *offsetp = offset;
+       return (offset < fulllen) ? &path[offset] : NULL;
+}
+
+/*
+ * Check that @subdir is a subdir of @dir.  @len is the length of
+ * @dir (to avoid having to recalculate it).
+ *
+ * @subdir matches when it equals @dir, or when it starts with @dir and
+ * the match ends on a '/' boundary (so "/ab" is not under "/a").
+ *
+ * Returns true on a match and false otherwise.  An empty @dir (@len == 0)
+ * never matches.
+ */
+static bool is_subdir(const char *subdir, const char *dir, size_t len)
+{
+       size_t subdirlen;
+
+       /* with len == 0 the dir[len - 1] test below would read dir[-1] */
+       if (len == 0)
+               return false;
+
+       subdirlen = strlen(subdir);
+       if (subdirlen < len)
+               return false;
+       if (strncmp(subdir, dir, len) != 0)
+               return false;
+       /* @dir itself ends in '/', so the prefix match is already exact */
+       if (dir[len - 1] == '/')
+               return true;
+
+       return subdir[len] == '/' || subdirlen == len;
+}
+
+/*
+ * Inspect an already open fd with fstat().
+ *
+ * Returns 0 if the fd does not refer to a symbolic link, -ELOOP if it
+ * does, and -ENOENT if fstat() itself failed.
+ */
+static int check_symlink(int fd)
+{
+       struct stat st;
+
+       if (fstat(fd, &st) < 0)
+               return -ENOENT;
+
+       return S_ISLNK(st.st_mode) ? -ELOOP : 0;
+}
+
+/*
+ * Open a file or directory, provided that it contains no symlinks.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ *
+ * @dirfd:    directory fd that @nextpath is resolved against (openat()).
+ * @nextpath: the path component to open.
+ *
+ * Returns the new fd on success.  On failure returns a negative value and
+ * leaves the reason in errno; in particular errno is ELOOP whenever
+ * @nextpath turned out to be a symbolic link, which is what the caller
+ * checks for.
+ */
+static int open_if_safe(int dirfd, const char *nextpath)
+{
+       int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
+       if (newfd >= 0) /* was not a symlink, all good */
+               return newfd;
+
+       if (errno == ELOOP)
+               return newfd;
+
+       if (errno == EPERM || errno == EACCES) {
+               /* we're not root (cause we got EPERM) so
+                  try opening with O_PATH */
+               newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
+               if (newfd >= 0) {
+                       /* O_PATH will return an fd for symlinks.  We know
+                        * nextpath wasn't a symlink at last openat, so if fd
+                        * is now a link, then something fishy is going on
+                        */
+                       int ret = check_symlink(newfd);
+                       if (ret < 0) {
+                               close(newfd);
+                               /* errno still holds the EPERM/EACCES of the
+                                * first openat() (or whatever close() left);
+                                * report the real reason instead
+                                */
+                               errno = -ret;
+                               newfd = ret;
+                       }
+               }
+       }
+
+       return newfd;
+}
+
+/*
+ * Open a path intending for mounting, ensuring that the final path
+ * is inside the container's rootfs.
+ *
+ * The target is copied, split at every '/', and then walked one component
+ * at a time with open_if_safe(), starting from a directory fd opened on
+ * @prefix_skip (or on "/" when @prefix_skip is NULL or empty).
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ *
+ * @target: path to be opened
+ * @prefix_skip: a part of @target in which to ignore symbolic links.  This
+ * would be the container's rootfs.
+ *
+ * Return an open fd for the path, or <0 on error: -EINVAL if @target does
+ * not lie under @prefix_skip, -ENOMEM if the working copy of @target cannot
+ * be allocated, otherwise the negative result of the failing open() or
+ * open_if_safe() call.
+ */
+static int open_without_symlink(const char *target, const char *prefix_skip)
+{
+       int curlen = 0, dirfd, fulllen, i;
+       char *dup = NULL;
+
+       fulllen = strlen(target);
+
+       /* make sure prefix-skip makes sense */
+       if (prefix_skip && strlen(prefix_skip) > 0) {
+               curlen = strlen(prefix_skip);
+               if (!is_subdir(target, prefix_skip, curlen)) {
+                       ERROR("WHOA there - target '%s' didn't start with prefix '%s'",
+                               target, prefix_skip);
+                       return -EINVAL;
+               }
+               /*
+                * get_nextpath() expects curlen to sit on a '/' (by then
+                * turned into '\0') or before it, so step back one
+                * character to make sure that happens
+                */
+               if (curlen)
+                       curlen--;
         } else {
-               /* Only truncate the environment if we're actually going to
-                * overwrite part of it. */
-               if (len >= arg_end - arg_start) {
-                       env_start = env_end;
+               prefix_skip = "/";
+               curlen = 0;
+       }
+
+       /* Make a copy of target which we can hack up, and tokenize it */
+       if ((dup = strdup(target)) == NULL) {
+               SYSERROR("Out of memory checking for symbolic link");
+               return -ENOMEM;
+       }
+       for (i = 0; i < fulllen; i++) {
+               if (dup[i] == '/')
+                       dup[i] = '\0';
+       }
+
+       dirfd = open(prefix_skip, O_RDONLY);
+       if (dirfd < 0)
+               goto out;
+       while (1) {
+               int newfd, saved_errno;
+               char *nextpath;
+
+               /* no component left: dirfd now refers to the full target */
+               if ((nextpath = get_nextpath(dup, &curlen, fulllen)) == NULL)
+                       goto out;
+               newfd = open_if_safe(dirfd, nextpath);
+               /* keep errno from open_if_safe(); close() may overwrite it */
+               saved_errno = errno;
+               close(dirfd);
+               dirfd = newfd;
+               if (newfd < 0) {
+                       errno = saved_errno;
+                       if (errno == ELOOP)
+                               SYSERROR("%s in %s was a symbolic link!", nextpath, target);
+                       goto out;
                 }
+       }
  
-               arg_end = arg_start + len;
+out:
+       free(dup);
+       return dirfd;
+}
  
-               /* check overflow */
-               if (arg_end < len || arg_end < arg_start) {
-                       return -1;
+/*
+ * Safely mount a path into a container, ensuring that the mount target
+ * is under the container's @rootfs.  (If @rootfs is NULL, then the container
+ * uses the host's /)
+ *
+ * The destination and, for relative bind mounts, the source are opened
+ * with open_without_symlink() and then handed to mount(2) through their
+ * /proc/self/fd/<fd> names.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ *
+ * Returns 0 on success.  On failure returns the negative value produced by
+ * open_without_symlink(), -EINVAL if a /proc/self/fd path could not be
+ * formatted, or the result of mount() with errno preserved.
+ */
+int safe_mount(const char *src, const char *dest, const char *fstype,
+               unsigned long flags, const void *data, const char *rootfs)
+{
+       int srcfd = -1, destfd, ret, saved_errno;
+       char srcbuf[50], destbuf[50]; /* only needs enough for /proc/self/fd/<fd> */
+       const char *mntsrc = src;
+
+       if (!rootfs)
+               rootfs = "";
+
+       /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
+       if (flags & MS_BIND && src && src[0] != '/') {
+               INFO("this is a relative bind mount");
+               srcfd = open_without_symlink(src, NULL);
+               if (srcfd < 0)
+                       return srcfd;
+               ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
+               /* ret == size already means the output was truncated */
+               if (ret < 0 || (size_t)ret >= sizeof(srcbuf)) {
+                       close(srcfd);
+                       ERROR("Failed to build /proc/self/fd path");
+                       return -EINVAL;
                 }
+               mntsrc = srcbuf;
+       }
  
+       destfd = open_without_symlink(dest, rootfs);
+       if (destfd < 0) {
+               if (srcfd != -1) {
+                       saved_errno = errno;
+                       close(srcfd);
+                       errno = saved_errno;
+               }
+               return destfd;
         }
  
-       strcpy((char*)arg_start, title);
+       ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
+       if (ret < 0 || (size_t)ret >= sizeof(destbuf)) {
+               if (srcfd != -1)
+                       close(srcfd);
+               close(destfd);
+               ERROR("Failed to build /proc/self/fd path");
+               return -EINVAL;
+       }
  
-       ret |= prctl(PR_SET_MM, PR_SET_MM_ARG_START,   arg_start, 0, 0);
-       ret |= prctl(PR_SET_MM, PR_SET_MM_ARG_END,     arg_end, 0, 0);
-       ret |= prctl(PR_SET_MM, PR_SET_MM_ENV_START,   env_start, 0, 0);
-       ret |= prctl(PR_SET_MM, PR_SET_MM_ENV_END,     env_end, 0, 0);
+       ret = mount(mntsrc, destbuf, fstype, flags, data);
+       /* the close() calls below must not hide why mount() failed */
+       saved_errno = errno;
+       if (srcfd != -1)
+               close(srcfd);
+       close(destfd);
+       if (ret < 0) {
+               errno = saved_errno;
+               /* @src may legitimately be NULL; never pass NULL to %s */
+               SYSERROR("Failed to mount %s onto %s", src ? src : "(null)", dest);
+               return ret;
+       }
  
-       return ret;
+       return 0;
  }
  
  /*
@@ -1411,6 +1723,8 @@ int setproctitle(char *title)
   *
   * Returns < 0 on failure, 0 if the correct proc was already mounted
   * and 1 if a new proc was mounted.
+ *
+ * NOTE: not to be called from inside the container namespace!
   */
  int mount_proc_if_needed(const char *rootfs)
  {
@@ -1429,6 +1743,10 @@ int mount_proc_if_needed(const char *rootfs)
         mypid = (int)getpid();
         INFO("I am %d, /proc/self points to '%s'", mypid, link);
         ret = snprintf(path, MAXPATHLEN, "%s/proc", rootfs);
+       if (ret < 0 || ret >= MAXPATHLEN) {
+               SYSERROR("proc path name too long");
+               return -1;
+       }
         if (linklen < 0) /* /proc not mounted */
                 goto domount;
         if (atoi(link) != mypid) {
@@ -1440,29 +1758,139 @@ int mount_proc_if_needed(const char *rootfs)
         return 0;
  
  domount:
-       if (mount("proc", path, "proc", 0, NULL))
+       if (!strcmp(rootfs,"")) /* rootfs is NULL */
+               ret = mount("proc", path, "proc", 0, NULL);
+       else
+               ret = safe_mount("proc", path, "proc", 0, NULL, rootfs);
+
+       if (ret < 0)
                 return -1;
+
         INFO("Mounted /proc in container for security transition");
         return 1;
  }
  
-int null_stdfds(void)
+/*
+ * Open /dev/null for reading and writing.
+ *
+ * Returns the new fd, or the negative result of open() after logging the
+ * failure with SYSERROR.  The caller owns the returned fd.
+ */
+int open_devnull(void)
  {
-       int fd, ret = -1;
+       int fd = open("/dev/null", O_RDWR);
+
+       if (fd < 0)
+               SYSERROR("Can't open /dev/null");
+
+       return fd;
+}
  
-       fd = open("/dev/null", O_RDWR);
+/*
+ * Duplicate @fd onto file descriptors 0, 1 and 2 with dup2().
+ *
+ * @fd is not closed here; that is left to the caller.
+ *
+ * Returns 0 on success, -1 if @fd is negative or any dup2() call fails
+ * (descriptors duplicated before the failure stay redirected).
+ */
+int set_stdfds(int fd)
+{
         if (fd < 0)
                 return -1;
  
         if (dup2(fd, 0) < 0)
-               goto err;
+               return -1;
         if (dup2(fd, 1) < 0)
-               goto err;
+               return -1;
         if (dup2(fd, 2) < 0)
-               goto err;
+               return -1;
+
+       return 0;
+}
+
+/*
+ * Point file descriptors 0, 1 and 2 at /dev/null.
+ *
+ * Opens /dev/null with open_devnull(), duplicates it onto the three
+ * descriptors with set_stdfds() and closes the temporary fd again.
+ *
+ * Returns the result of set_stdfds(), or -1 if /dev/null could not be
+ * opened.
+ */
+int null_stdfds(void)
+{
+       int ret = -1;
+       int fd = open_devnull();
+
+       if (fd >= 0) {
+               ret = set_stdfds(fd);
+               close(fd);
+       }
  
-       ret = 0;
-err:
-       close(fd);
         return ret;
  }
+
+/*
+ * Count the lines of file @fn by reading it with getline() until the
+ * end of the file.
+ *
+ * Returns the number of lines read, or -1 if the file cannot be opened.
+ */
+int lxc_count_file_lines(const char *fn)
+{
+       char *buf = NULL;
+       size_t buflen = 0;
+       int count;
+       FILE *fp = fopen_cloexec(fn, "r");
+
+       if (!fp)
+               return -1;
+
+       /* every successful getline() is one more line */
+       for (count = 0; getline(&buf, &buflen, fp) != -1; count++)
+               ;
+
+       free(buf);
+       fclose(fp);
+       return count;
+}
+
+/*
+ * Map @length bytes of @fd so that the mapping can be used as a C string.
+ *
+ * Parameters have the same meaning as for mmap(); @addr is only passed
+ * to the initial anonymous mapping.
+ *
+ * Returns the address of the mapping, or MAP_FAILED if either mmap()
+ * call fails.  A successful mapping spans length + 1 bytes.
+ */
+void *lxc_strmmap(void *addr, size_t length, int prot, int flags, int fd,
+                 off_t offset)
+{
+       void *anon, *mapped;
+
+       /* Reserve a zero-filled anonymous region one byte larger than the
+        * requested length; the extra byte is the string terminator. */
+       anon = mmap(addr, length + 1, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       if (anon == MAP_FAILED)
+               return MAP_FAILED;
+
+       /* Lay the file over the first @length bytes of that region at the
+        * very same address, leaving the trailing \0-byte in place. */
+       mapped = mmap(anon, length, prot, MAP_FIXED | flags, fd, offset);
+       if (mapped != MAP_FAILED)
+               return mapped;
+
+       /* could not map the file: give the reservation back */
+       munmap(anon, length + 1);
+       return MAP_FAILED;
+}
+
+/*
+ * Unmap a region of @length + 1 bytes starting at @addr.  The extra byte
+ * mirrors the one lxc_strmmap() adds to its mapping, so @length should be
+ * the same value that was passed to lxc_strmmap().
+ *
+ * Returns the result of munmap().
+ */
+int lxc_strmunmap(void *addr, size_t length)
+{
+       return munmap(addr, length + 1);
+}
+
+/*
+ * Check whether a signal is blocked by a process.
+ *
+ * Reads the "SigBlk:" line of /proc/<pid>/status, a hexadecimal bitmask
+ * in which signal number N is represented by bit N - 1.
+ *
+ * @pid:    process to inspect.
+ * @signal: signal number, starting at 1.
+ *
+ * Returns true if the signal's bit is set in the mask.  Returns false if
+ * it is not set, if @signal has no bit in the mask, or if the status file
+ * cannot be opened or parsed.
+ */
+bool task_blocking_signal(pid_t pid, int signal)
+{
+       bool bret = false;
+       char *line = NULL;
+       unsigned long sigblk = 0;
+       size_t n = 0;
+       int ret;
+       FILE *f;
+       /* "/proc/" + pid as decimal string + "/status" + \0 */
+       char status[6 + 21 + 7 + 1];
+
+       /* shifting by a negative or too large amount is undefined */
+       if (signal < 1 || (size_t)signal > 8 * sizeof(sigblk))
+               return bret;
+
+       ret = snprintf(status, sizeof(status), "/proc/%d/status", (int)pid);
+       if (ret < 0 || (size_t)ret >= sizeof(status))
+               return bret;
+
+       f = fopen(status, "r");
+       if (!f)
+               return bret;
+
+       while (getline(&line, &n, f) != -1) {
+               if (strncmp(line, "SigBlk:\t", 8))
+                       continue;
+               if (sscanf(line + 8, "%lx", &sigblk) != 1)
+                       goto out;
+               break;
+       }
+
+       /* test the signal's bit, not the signal number itself */
+       if (sigblk & (1UL << (signal - 1)))
+               bret = true;
+
+out:
+       free(line);
+       fclose(f);
+       return bret;
+}