#include "config.h"
+#include <assert.h>
+#include <dirent.h>
#include <errno.h>
-#include <unistd.h>
-#include <stdlib.h>
+#include <fcntl.h>
+#include <libgen.h>
#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
-#include <sys/types.h>
-#include <sys/vfs.h>
-#include <sys/stat.h>
+#include <unistd.h>
#include <sys/mman.h>
-#include <sys/param.h>
#include <sys/mount.h>
-#include <dirent.h>
-#include <fcntl.h>
-#include <libgen.h>
+#include <sys/param.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
#include <sys/types.h>
+#include <sys/vfs.h>
#include <sys/wait.h>
-#include <assert.h>
-#include <sys/prctl.h>
-#include "utils.h"
#include "log.h"
#include "lxclock.h"
#include "namespace.h"
+#include "utils.h"
#ifndef PR_SET_MM
#define PR_SET_MM 35
#endif
-#ifndef PR_SET_MM_ARG_START
-#define PR_SET_MM_ARG_START 8
-#endif
-
-#ifndef PR_SET_MM_ARG_END
-#define PR_SET_MM_ARG_END 9
+#ifndef PR_SET_MM_MAP
+#define PR_SET_MM_MAP 14 /* fallback for headers that do not define it */
+
+struct prctl_mm_map { /* argument block handed to prctl(PR_SET_MM, PR_SET_MM_MAP, ...) by setproctitle() */
+ uint64_t start_code;
+ uint64_t end_code;
+ uint64_t start_data;
+ uint64_t end_data;
+ uint64_t start_brk;
+ uint64_t brk;
+ uint64_t start_stack;
+ uint64_t arg_start; /* setproctitle() points arg_start/arg_end at the new title */
+ uint64_t arg_end;
+ uint64_t env_start;
+ uint64_t env_end;
+ uint64_t *auxv; /* setproctitle() passes NULL with auxv_size 0 */
+ uint32_t auxv_size;
+ uint32_t exe_fd; /* setproctitle() passes -1 */
+};
 #endif
-#ifndef PR_SET_MM_ENV_START
-#define PR_SET_MM_ENV_START 10
+#ifndef O_PATH
+#define O_PATH 010000000
#endif
-#ifndef PR_SET_MM_ENV_END
-#define PR_SET_MM_ENV_END 11
+#ifndef O_NOFOLLOW
+#define O_NOFOLLOW 00400000
#endif
lxc_log_define(lxc_utils, lxc);
static int _recursive_rmdir(char *dirname, dev_t pdev,
const char *exclude, int level, bool onedev)
{
- struct dirent dirent, *direntp;
+ struct dirent *direntp;
DIR *dir;
int ret, failed=0;
char pathname[MAXPATHLEN];
return -1;
}
- while (!readdir_r(dir, &dirent, &direntp)) {
+ while ((direntp = readdir(dir))) {
struct stat mystat;
int rc;
}
if (lstat(path, &mystat) < 0) {
+ if (errno == ENOENT)
+ return 0;
ERROR("%s: failed to stat %s", __func__, path);
return -1;
}
return components;
}
+/*
+ * Rewrite @path in place from the components returned by
+ * lxc_normalize_path(), joined with single '/' separators.
+ *
+ * @path: NUL-terminated, writable path. Only the bytes already occupied by
+ *        the string (strlen(path) + 1) are assumed to be available.
+ *
+ * Returns true when @path was rewritten, false when normalizing or joining
+ * failed or when the result would not fit into the original string. On
+ * failure @path is left unmodified.
+ */
+bool lxc_deslashify(char *path)
+{
+	char **parts;
+	char *path2;
+	size_t newlen;
+
+	parts = lxc_normalize_path(path);
+	if (!parts)
+		return false;
+
+	/* The third argument is true only for absolute paths; presumably it
+	 * makes lxc_string_join() emit the leading '/'. */
+	path2 = lxc_string_join("/", (const char **)parts, *path == '/');
+	lxc_free_array((void **)parts, free);
+	if (!path2)
+		return false;
+
+	/* The caller's buffer is only known to hold strlen(path) + 1 bytes,
+	 * so refuse to write anything longer instead of silently truncating
+	 * the result.
+	 */
+	newlen = strlen(path2);
+	if (newlen > strlen(path)) {
+		free(path2);
+		return false;
+	}
+
+	/* Copy including the terminator so the result never depends on
+	 * whatever bytes were left over from the old string. */
+	memcpy(path, path2, newlen + 1);
+	free(path2);
+	return true;
+}
+}
+
char *lxc_append_paths(const char *first, const char *second)
{
size_t len = strlen(first) + strlen(second) + 1;
if (count) {
temp = realloc(array, (count + 1) * sizeof(*array));
if (!temp) {
- int i;
+ size_t i;
for (i = 0; i < count; i++)
free(array[i]);
free(array);
return stat(f, &statbuf) == 0;
}
+/*
+ * Report whether /proc/self/ns/cgroup exists, as determined by
+ * file_exists().
+ */
+bool cgns_supported(void)
+{
+	const bool have_cgroup_ns = file_exists("/proc/self/ns/cgroup");
+
+	return have_cgroup_ns;
+}
+
/* historically lxc-init has been under /usr/lib/lxc and under
* /usr/lib/$ARCH/lxc. It now lives as $prefix/sbin/init.lxc.
*/
*/
int setproctitle(char *title)
{
+ static char *proctitle = NULL;
char buf[2048], *tmp;
FILE *f;
int i, len, ret = 0;
- unsigned long arg_start, arg_end, env_start, env_end;
+
+ /* We don't really need to know all of this stuff, but unfortunately
+ * PR_SET_MM_MAP requires us to set it all at once, so we have to
+ * figure it out anyway.
+ */
+ unsigned long start_data, end_data, start_brk, start_code, end_code,
+ start_stack, arg_start, arg_end, env_start, env_end,
+ brk_val;
+ struct prctl_mm_map prctl_map;
f = fopen_cloexec("/proc/self/stat", "r");
if (!f) {
return -1;
}
- /* Skip the first 47 fields, column 48-51 are ARG_START and
- * ARG_END. */
+ /* Skip the first 25 fields, column 26-28 are start_code, end_code,
+ * and start_stack */
tmp = strchr(buf, ' ');
- for (i = 0; i < 46; i++) {
+ for (i = 0; i < 24; i++) {
if (!tmp)
return -1;
tmp = strchr(tmp+1, ' ');
}
-
if (!tmp)
return -1;
- i = sscanf(tmp, "%lu %lu %lu %lu", &arg_start, &arg_end, &env_start, &env_end);
- if (i != 4) {
+ i = sscanf(tmp, "%lu %lu %lu", &start_code, &end_code, &start_stack);
+ if (i != 3)
return -1;
+
+ /* Skip the next 19 fields, column 45-51 are start_data to arg_end */
+ for (i = 0; i < 19; i++) {
+ if (!tmp)
+ return -1;
+ tmp = strchr(tmp+1, ' ');
}
+ if (!tmp)
+ return -1;
+
+ i = sscanf(tmp, "%lu %lu %lu %lu %lu %lu %lu",
+ &start_data,
+ &end_data,
+ &start_brk,
+ &arg_start,
+ &arg_end,
+ &env_start,
+ &env_end);
+ if (i != 7)
+ return -1;
+
/* Include the null byte here, because in the calculations below we
* want to have room for it. */
len = strlen(title) + 1;
- /* We're truncating the environment, so we should use at most the
- * length of the argument + environment for the title. */
- if (len > env_end - arg_start) {
- arg_end = env_end;
- len = env_end - arg_start;
+ /* If we don't have enough room by just overwriting the old proctitle,
+ * let's allocate a new one.
+ */
+ if (len > arg_end - arg_start) {
+ void *m;
+ m = realloc(proctitle, len);
+ if (!m)
+ return -1;
+ proctitle = m;
+
+ arg_start = (unsigned long) proctitle;
+ }
+
+ arg_end = arg_start + len;
+
+ brk_val = syscall(__NR_brk, 0);
+
+ prctl_map = (struct prctl_mm_map) {
+ .start_code = start_code,
+ .end_code = end_code,
+ .start_stack = start_stack,
+ .start_data = start_data,
+ .end_data = end_data,
+ .start_brk = start_brk,
+ .brk = brk_val,
+ .arg_start = arg_start,
+ .arg_end = arg_end,
+ .env_start = env_start,
+ .env_end = env_end,
+ .auxv = NULL,
+ .auxv_size = 0,
+ .exe_fd = -1,
+ };
+
+ ret = prctl(PR_SET_MM, PR_SET_MM_MAP, (long) &prctl_map, sizeof(prctl_map), 0);
+ if (ret == 0)
+ strcpy((char*)arg_start, title);
+ else
+ INFO("setting cmdline failed - %s", strerror(errno));
+
+ return ret;
+}
+
+/*
+ * Step to the next segment of a tokenized path.
+ *
+ * @path: a pathname in which every '/' has been replaced with '\0'.
+ * @offsetp: pointer to int showing which path segment was last seen.
+ *           Updated on return to reflect the next segment.
+ * @fulllen: full original path length.
+ *
+ * Returns a pointer to the next path segment, or NULL if done.
+ */
+static char *get_nextpath(char *path, int *offsetp, int fulllen)
+{
+	int offset = *offsetp;
+
+	if (offset >= fulllen)
+		return NULL;
+
+	/* Skip what is left of the current segment. The bound is tested
+	 * before the byte is read so that path[offset] is never touched
+	 * once offset has reached fulllen.
+	 */
+	while (offset < fulllen && path[offset] != '\0')
+		offset++;
+
+	/* Skip the separator(s); consecutive slashes leave a run of '\0'. */
+	while (offset < fulllen && path[offset] == '\0')
+		offset++;
+
+	*offsetp = offset;
+	return (offset < fulllen) ? &path[offset] : NULL;
+}
+
+/*
+ * Check that @subdir is a subdir of @dir. @len is the length of
+ * @dir (to avoid having to recalculate it).
+ *
+ * The match is done on whole path components: "/ab" is not below "/a".
+ * @subdir equal to @dir counts as a match. An empty @dir (@len == 0)
+ * never matches.
+ */
+static bool is_subdir(const char *subdir, const char *dir, size_t len)
+{
+	size_t subdirlen = strlen(subdir);
+
+	/* dir[len - 1] below would read before the start of @dir. */
+	if (len == 0)
+		return false;
+
+	if (subdirlen < len)
+		return false;
+
+	if (strncmp(subdir, dir, len) != 0)
+		return false;
+
+	/* @dir already ends in a separator, so the prefix match suffices. */
+	if (dir[len - 1] == '/')
+		return true;
+
+	/* Otherwise the prefix must end exactly on a component boundary. */
+	return subdir[len] == '/' || subdirlen == len;
+}
+
+/*
+ * Inspect an already open fd with fstat().
+ *
+ * Returns -ENOENT when fstat() fails, -ELOOP when the fd refers to a
+ * symbolic link, and 0 otherwise.
+ */
+static int check_symlink(int fd)
+{
+	struct stat st;
+
+	if (fstat(fd, &st) < 0)
+		return -ENOENT;
+
+	return S_ISLNK(st.st_mode) ? -ELOOP : 0;
+}
+
+/*
+ * Open a file or directory, provided that it contains no symlinks.
+ *
+ * @dirfd: directory fd that @nextpath is resolved against (see openat()).
+ * @nextpath: single path component to open.
+ *
+ * Returns the new fd on success. On failure a negative value is returned
+ * and errno describes the problem; errno is ELOOP when @nextpath turned
+ * out to be a symbolic link.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ */
+static int open_if_safe(int dirfd, const char *nextpath)
+{
+	int newfd, ret;
+
+	newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
+	if (newfd >= 0) /* was not a symlink, all good */
+		return newfd;
+
+	/* ELOOP means a symlink; any error other than a permission problem
+	 * is passed straight through as well. */
+	if (errno != EPERM && errno != EACCES)
+		return newfd;
+
+	/* we're not root (cause we got EPERM) so try opening with O_PATH */
+	newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
+	if (newfd < 0)
+		return newfd;
+
+	/* O_PATH will return an fd for symlinks. We know nextpath wasn't a
+	 * symlink at last openat, so if fd is now a link, then something
+	 * fishy is going on.
+	 */
+	ret = check_symlink(newfd);
+	if (ret < 0) {
+		close(newfd);
+		/* Callers look at errno (e.g. for ELOOP), so make it agree
+		 * with the code we return instead of leaving whatever the
+		 * earlier openat() or close() stored there. */
+		errno = -ret;
+		return ret;
+	}
+
+	return newfd;
+}
+
+/*
+ * Open a path intending for mounting, ensuring that the final path
+ * is inside the container's rootfs.
+ *
+ * The function opens @prefix_skip (or "/" when it is NULL or empty) and
+ * then walks the remaining components of @target one at a time with
+ * open_if_safe(), each relative to the fd of its parent, so a symbolic
+ * link in any of those components makes the walk fail.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ *
+ * @target: path to be opened
+ * @prefix_skip: a part of @target in which to ignore symbolic links. This
+ * would be the container's rootfs.
+ *
+ * Return an open fd for the path, or <0 on error: -EINVAL when @target is
+ * not below @prefix_skip, -ENOMEM when @target cannot be duplicated, and
+ * otherwise the negative value produced by the failing open()/open_if_safe().
+ * The fd is not closed here; the caller has to close() it.
+ *
+ * NOTE(review): when @prefix_skip is NULL or empty the walk starts at
+ * offset 0, and for a @target that does not begin with '/' get_nextpath()
+ * appears to step over the first component - confirm whether relative
+ * targets are expected to work.
+ */
+static int open_without_symlink(const char *target, const char *prefix_skip)
+{
+ int curlen = 0, dirfd, fulllen, i;
+ char *dup = NULL;
+
+ fulllen = strlen(target);
+
+ /* make sure prefix-skip makes sense */
+ if (prefix_skip && strlen(prefix_skip) > 0) {
+ curlen = strlen(prefix_skip);
+ if (!is_subdir(target, prefix_skip, curlen)) {
+ ERROR("WHOA there - target '%s' didn't start with prefix '%s'",
+ target, prefix_skip);
+ return -EINVAL;
+ }
+ /*
+ * get_nextpath() expects the curlen argument to be
+ * on a (turned into \0) / or before it, so decrement
+ * curlen to make sure that happens
+ */
+ if (curlen)
+ curlen--;
 } else {
- /* Only truncate the environment if we're actually going to
- * overwrite part of it. */
- if (len >= arg_end - arg_start) {
- env_start = env_end;
+ prefix_skip = "/";
+ curlen = 0;
+ }
+
+ /* Make a copy of target which we can hack up, and tokenize it */
+ if ((dup = strdup(target)) == NULL) {
+ SYSERROR("Out of memory checking for symbolic link");
+ return -ENOMEM;
+ }
+ for (i = 0; i < fulllen; i++) {
+ if (dup[i] == '/')
+ dup[i] = '\0';
+ }
+
+ dirfd = open(prefix_skip, O_RDONLY);
+ if (dirfd < 0)
+ goto out;
+ while (1) {
+ int newfd, saved_errno;
+ char *nextpath;
+
+ if ((nextpath = get_nextpath(dup, &curlen, fulllen)) == NULL)
+ goto out;
+ newfd = open_if_safe(dirfd, nextpath);
+ saved_errno = errno;
+ close(dirfd);
+ dirfd = newfd;
+ if (newfd < 0) {
+ errno = saved_errno;
+ if (errno == ELOOP)
+ SYSERROR("%s in %s was a symbolic link!", nextpath, target);
+ goto out;
 }
+ }
- arg_end = arg_start + len;
+out:
+ free(dup);
+ return dirfd;
+}
- /* check overflow */
- if (arg_end < len || arg_end < arg_start) {
- return -1;
+/*
+ * Safely mount a path into a container, ensuring that the mount target
+ * is under the container's @rootfs. (If @rootfs is NULL, then the container
+ * uses the host's /)
+ *
+ * @dest (and @src, for a bind mount with a relative source) is opened with
+ * open_without_symlink() and the mount is then performed on the matching
+ * /proc/self/fd/<fd> entry rather than on the path itself.
+ *
+ * @src, @fstype, @flags, @data: passed on to mount().
+ * @dest: mount target; has to be below @rootfs.
+ * @rootfs: prefix of @dest in which symbolic links are tolerated.
+ *
+ * Returns 0 on success and a negative value on failure. When mount() itself
+ * fails, errno is the value mount() set.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ */
+int safe_mount(const char *src, const char *dest, const char *fstype,
+		unsigned long flags, const void *data, const char *rootfs)
+{
+	int srcfd = -1, destfd, ret, saved_errno;
+	char srcbuf[50], destbuf[50]; /* only needs enough for /proc/self/fd/<fd> */
+	const char *mntsrc = src;
+
+	if (!rootfs)
+		rootfs = "";
+
+	/* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
+	if ((flags & MS_BIND) && src && src[0] != '/') {
+		INFO("this is a relative bind mount");
+		srcfd = open_without_symlink(src, NULL);
+		if (srcfd < 0)
+			return srcfd;
+		ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
+		/* ret == sizeof(srcbuf) already means the string was cut short */
+		if (ret < 0 || (size_t)ret >= sizeof(srcbuf)) {
+			close(srcfd);
+			ERROR("Failed to build /proc/self/fd path for mount source");
+			return -EINVAL;
 }
+		mntsrc = srcbuf;
+	}
+	destfd = open_without_symlink(dest, rootfs);
+	if (destfd < 0) {
+		if (srcfd != -1) {
+			/* keep the errno of the failed open for the caller */
+			saved_errno = errno;
+			close(srcfd);
+			errno = saved_errno;
+		}
+		return destfd;
 }
- strcpy((char*)arg_start, title);
+	ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
+	if (ret < 0 || (size_t)ret >= sizeof(destbuf)) {
+		if (srcfd != -1)
+			close(srcfd);
+		close(destfd);
+		ERROR("Failed to build /proc/self/fd path for mount target");
+		return -EINVAL;
+	}
- ret |= prctl(PR_SET_MM, PR_SET_MM_ARG_START, arg_start, 0, 0);
- ret |= prctl(PR_SET_MM, PR_SET_MM_ARG_END, arg_end, 0, 0);
- ret |= prctl(PR_SET_MM, PR_SET_MM_ENV_START, env_start, 0, 0);
- ret |= prctl(PR_SET_MM, PR_SET_MM_ENV_END, env_end, 0, 0);
+	ret = mount(mntsrc, destbuf, fstype, flags, data);
+	/* close() may overwrite errno, so remember what mount() reported */
+	saved_errno = errno;
+	if (srcfd != -1)
+		close(srcfd);
+	close(destfd);
+	if (ret < 0) {
+		errno = saved_errno;
+		SYSERROR("Failed to mount %s onto %s", src, dest);
+		return ret;
+	}
- return ret;
+	return 0;
 }
/*
*
* Returns < 0 on failure, 0 if the correct proc was already mounted
* and 1 if a new proc was mounted.
+ *
+ * NOTE: not to be called from inside the container namespace!
*/
int mount_proc_if_needed(const char *rootfs)
{
mypid = (int)getpid();
INFO("I am %d, /proc/self points to '%s'", mypid, link);
ret = snprintf(path, MAXPATHLEN, "%s/proc", rootfs);
+ if (ret < 0 || ret >= MAXPATHLEN) {
+ SYSERROR("proc path name too long");
+ return -1;
+ }
if (linklen < 0) /* /proc not mounted */
goto domount;
if (atoi(link) != mypid) {
return 0;
domount:
- if (mount("proc", path, "proc", 0, NULL))
+ if (!strcmp(rootfs,"")) /* rootfs is NULL */
+ ret = mount("proc", path, "proc", 0, NULL);
+ else
+ ret = safe_mount("proc", path, "proc", 0, NULL, rootfs);
+
+ if (ret < 0)
return -1;
+
INFO("Mounted /proc in container for security transition");
return 1;
}
-int null_stdfds(void)
+/*
+ * Open /dev/null for reading and writing.
+ *
+ * Returns the fd from open(); a failure is logged and the negative value
+ * is handed back to the caller.
+ */
+int open_devnull(void)
 {
- int fd, ret = -1;
+	int devnull_fd;
+
+	devnull_fd = open("/dev/null", O_RDWR);
+	if (devnull_fd >= 0)
+		return devnull_fd;
+
+	SYSERROR("Can't open /dev/null");
+	return devnull_fd;
+}
- fd = open("/dev/null", O_RDWR);
+/*
+ * Duplicate @fd onto file descriptors 0, 1 and 2.
+ *
+ * Returns 0 on success, -1 when @fd is negative or a dup2() call fails.
+ * @fd itself is not closed.
+ */
+int set_stdfds(int fd)
+{
 if (fd < 0)
 return -1;
 if (dup2(fd, 0) < 0)
- goto err;
+		return -1;
 if (dup2(fd, 1) < 0)
- goto err;
+		return -1;
 if (dup2(fd, 2) < 0)
- goto err;
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Point file descriptors 0, 1 and 2 at /dev/null.
+ *
+ * Returns the result of set_stdfds(), or -1 when /dev/null could not be
+ * opened. The temporary fd is closed again before returning.
+ */
+int null_stdfds(void)
+{
+	int fd, ret;
+
+	fd = open_devnull();
+	if (fd < 0) {
+		ret = -1;
+	} else {
+		ret = set_stdfds(fd);
+		close(fd);
+	}
- ret = 0;
-err:
- close(fd);
 return ret;
 }
+
+/*
+ * Count the lines of file @fn by reading it with getline() until the end.
+ *
+ * Returns the number of lines read, or -1 if the file could not be opened.
+ */
+int lxc_count_file_lines(const char *fn)
+{
+	char *buf = NULL;
+	size_t cap = 0;
+	int count;
+	FILE *fp = fopen_cloexec(fn, "r");
+
+	if (!fp)
+		return -1;
+
+	/* getline() grows buf as needed; only the number of calls matters. */
+	for (count = 0; getline(&buf, &cap, fp) != -1; count++)
+		;
+
+	free(buf);
+	fclose(fp);
+	return count;
+}
+
+/*
+ * mmap() a file so that the mapping can be handled like a C string.
+ *
+ * @addr, @prot, @flags, @fd, @offset: as for mmap(); MAP_FIXED is added to
+ *         @flags for the file mapping.
+ * @length: number of bytes of the file to map.
+ *
+ * Returns the mapping, or MAP_FAILED. Release it with lxc_strmunmap().
+ */
+void *lxc_strmmap(void *addr, size_t length, int prot, int flags, int fd,
+		  off_t offset)
+{
+	void *anon, *mapped;
+	const size_t padded = length + 1;
+
+	/* Reserve an anonymous, zero-filled region that is one byte larger
+	 * than what we want to map from the file. */
+	anon = mmap(addr, padded, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (anon == MAP_FAILED)
+		return anon;
+
+	/* Lay the file over the start of that region at a fixed address.
+	 * The extra byte stays untouched and serves as the \0 terminator,
+	 * which is what lets normal string functions work on the result. */
+	mapped = mmap(anon, length, prot, MAP_FIXED | flags, fd, offset);
+	if (mapped != MAP_FAILED)
+		return mapped;
+
+	munmap(anon, padded);
+	return mapped;
+}
+
+/*
+ * Unmap @length + 1 bytes starting at @addr, i.e. a mapping of @length
+ * bytes plus the one extra byte lxc_strmmap() adds.
+ *
+ * Returns the result of munmap().
+ */
+int lxc_strmunmap(void *addr, size_t length)
+{
+	const size_t mapped_len = length + 1;
+
+	return munmap(addr, mapped_len);
+}
+
+/*
+ * Check whether a signal is blocked by a process.
+ *
+ * @pid: process whose /proc/<pid>/status is inspected.
+ * @signal: signal number (1-based, as in <signal.h>).
+ *
+ * Returns true if the SigBlk mask of @pid has the bit for @signal set.
+ * Returns false otherwise, including when @signal is out of range or the
+ * status file cannot be opened or parsed.
+ */
+bool task_blocking_signal(pid_t pid, int signal)
+{
+	bool bret = false;
+	char *line = NULL;
+	unsigned long sigblk = 0;
+	size_t n = 0;
+	int ret;
+	FILE *f;
+	/* "/proc" + "/" and up to 20 digits of pid + "/status" + \0 */
+	char status[5 + 21 + 7 + 1];
+
+	/* Signal N is bit N - 1 of the mask; anything else cannot be
+	 * represented and would make the shift below undefined. */
+	if (signal < 1 || (size_t)signal > sizeof(sigblk) * 8)
+		return bret;
+
+	ret = snprintf(status, sizeof(status), "/proc/%d/status", (int)pid);
+	if (ret < 0 || (size_t)ret >= sizeof(status))
+		return bret;
+
+	f = fopen(status, "r");
+	if (!f)
+		return bret;
+
+	while (getline(&line, &n, f) != -1) {
+		if (strncmp(line, "SigBlk:\t", 8) != 0)
+			continue;
+
+		/* The mask is printed as a hexadecimal number. */
+		if (sscanf(line + 8, "%lx", &sigblk) != 1)
+			goto out;
+
+		/* Found what we came for, no need to read the rest. */
+		break;
+	}
+
+	if (sigblk & (1UL << (signal - 1)))
+		bret = true;
+
+out:
+	free(line);
+	fclose(f);
+	return bret;
+}