X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=src%2Flxc%2Futils.c;h=9a6ef4b37528f1cb9460164db8d380b4decff15a;hb=aeb3682ff631e6afaa9c0b5d8a32654e8ebd2771;hp=7ced314872ac4bbca21d9e564f6188d95163db68;hpb=10841821111263ccc1b56709e3e01521203a8fa5;p=mirror_lxc.git diff --git a/src/lxc/utils.c b/src/lxc/utils.c index 7ced31487..9a6ef4b37 100644 --- a/src/lxc/utils.c +++ b/src/lxc/utils.c @@ -23,48 +23,61 @@ #include "config.h" +#include +#include #include -#include -#include +#include +#include #include +#include +#include #include -#include -#include -#include +#include #include -#include #include -#include -#include -#include +#include +#include +#include #include +#include #include -#include -#include -#include "utils.h" #include "log.h" #include "lxclock.h" #include "namespace.h" +#include "utils.h" #ifndef PR_SET_MM #define PR_SET_MM 35 #endif -#ifndef PR_SET_MM_ARG_START -#define PR_SET_MM_ARG_START 8 -#endif - -#ifndef PR_SET_MM_ARG_END -#define PR_SET_MM_ARG_END 9 +#ifndef PR_SET_MM_MAP +#define PR_SET_MM_MAP 14 + +struct prctl_mm_map { + uint64_t start_code; + uint64_t end_code; + uint64_t start_data; + uint64_t end_data; + uint64_t start_brk; + uint64_t brk; + uint64_t start_stack; + uint64_t arg_start; + uint64_t arg_end; + uint64_t env_start; + uint64_t env_end; + uint64_t *auxv; + uint32_t auxv_size; + uint32_t exe_fd; +}; #endif -#ifndef PR_SET_MM_ENV_START -#define PR_SET_MM_ENV_START 10 +#ifndef O_PATH +#define O_PATH 010000000 #endif -#ifndef PR_SET_MM_ENV_END -#define PR_SET_MM_ENV_END 11 +#ifndef O_NOFOLLOW +#define O_NOFOLLOW 00400000 #endif lxc_log_define(lxc_utils, lxc); @@ -77,7 +90,7 @@ extern bool btrfs_try_remove_subvol(const char *path); static int _recursive_rmdir(char *dirname, dev_t pdev, const char *exclude, int level, bool onedev) { - struct dirent dirent, *direntp; + struct dirent *direntp; DIR *dir; int ret, failed=0; char pathname[MAXPATHLEN]; @@ -89,7 +102,7 @@ static int _recursive_rmdir(char *dirname, dev_t pdev, return -1; } 
- while (!readdir_r(dir, &dirent, &direntp)) { + while ((direntp = readdir(dir))) { struct stat mystat; int rc; @@ -197,6 +210,8 @@ extern int lxc_rmdir_onedev(char *path, const char *exclude) } if (lstat(path, &mystat) < 0) { + if (errno == ENOENT) + return 0; ERROR("%s: failed to stat %s", __func__, path); return -1; } @@ -701,6 +716,24 @@ char **lxc_normalize_path(const char *path) return components; } +bool lxc_deslashify(char *path) +{ + char **parts = NULL, *path2; + + parts = lxc_normalize_path(path); + if (!parts) + return false; + + path2 = lxc_string_join("/", (const char **) parts, *path == '/'); + lxc_free_array((void **) parts, free); + if (!path2) + return false; + + strncpy(path, path2, strlen(path)); + free(path2); + return true; +} + char *lxc_append_paths(const char *first, const char *second) { size_t len = strlen(first) + strlen(second) + 1; @@ -926,7 +959,7 @@ void **lxc_append_null_to_array(void **array, size_t count) if (count) { temp = realloc(array, (count + 1) * sizeof(*array)); if (!temp) { - int i; + size_t i; for (i = 0; i < count; i++) free(array[i]); free(array); @@ -1170,6 +1203,11 @@ bool file_exists(const char *f) return stat(f, &statbuf) == 0; } +bool cgns_supported(void) +{ + return file_exists("/proc/self/ns/cgroup"); +} + /* historically lxc-init has been under /usr/lib/lxc and under * /usr/lib/$ARCH/lxc. It now lives as $prefix/sbin/init.lxc. */ @@ -1334,10 +1372,19 @@ char *get_template_path(const char *t) */ int setproctitle(char *title) { + static char *proctitle = NULL; char buf[2048], *tmp; FILE *f; int i, len, ret = 0; - unsigned long arg_start, arg_end, env_start, env_end; + + /* We don't really need to know all of this stuff, but unfortunately + * PR_SET_MM_MAP requires us to set it all at once, so we have to + * figure it out anyway. 
+ */ + unsigned long start_data, end_data, start_brk, start_code, end_code, + start_stack, arg_start, arg_end, env_start, env_end, + brk_val; + struct prctl_mm_map prctl_map; f = fopen_cloexec("/proc/self/stat", "r"); if (!f) { @@ -1350,56 +1397,321 @@ int setproctitle(char *title) return -1; } - /* Skip the first 47 fields, column 48-51 are ARG_START and - * ARG_END. */ + /* Skip the first 25 fields, column 26-28 are start_code, end_code, + * and start_stack */ tmp = strchr(buf, ' '); - for (i = 0; i < 46; i++) { + for (i = 0; i < 24; i++) { if (!tmp) return -1; tmp = strchr(tmp+1, ' '); } - if (!tmp) return -1; - i = sscanf(tmp, "%lu %lu %lu %lu", &arg_start, &arg_end, &env_start, &env_end); - if (i != 4) { + i = sscanf(tmp, "%lu %lu %lu", &start_code, &end_code, &start_stack); + if (i != 3) return -1; + + /* Skip the next 19 fields, column 45-51 are start_data to arg_end */ + for (i = 0; i < 19; i++) { + if (!tmp) + return -1; + tmp = strchr(tmp+1, ' '); } + if (!tmp) + return -1; + + i = sscanf(tmp, "%lu %lu %lu %lu %lu %lu %lu", + &start_data, + &end_data, + &start_brk, + &arg_start, + &arg_end, + &env_start, + &env_end); + if (i != 7) + return -1; + /* Include the null byte here, because in the calculations below we * want to have room for it. */ len = strlen(title) + 1; - /* We're truncating the environment, so we should use at most the - * length of the argument + environment for the title. */ - if (len > env_end - arg_start) { - arg_end = env_end; - len = env_end - arg_start; + /* If we don't have enough room by just overwriting the old proctitle, + * let's allocate a new one. 
+ */ + if (len > arg_end - arg_start) { + void *m; + m = realloc(proctitle, len); + if (!m) + return -1; + proctitle = m; + + arg_start = (unsigned long) proctitle; + } + + arg_end = arg_start + len; + + brk_val = syscall(__NR_brk, 0); + + prctl_map = (struct prctl_mm_map) { + .start_code = start_code, + .end_code = end_code, + .start_stack = start_stack, + .start_data = start_data, + .end_data = end_data, + .start_brk = start_brk, + .brk = brk_val, + .arg_start = arg_start, + .arg_end = arg_end, + .env_start = env_start, + .env_end = env_end, + .auxv = NULL, + .auxv_size = 0, + .exe_fd = -1, + }; + + ret = prctl(PR_SET_MM, PR_SET_MM_MAP, (long) &prctl_map, sizeof(prctl_map), 0); + if (ret == 0) + strcpy((char*)arg_start, title); + else + INFO("setting cmdline failed - %s", strerror(errno)); + + return ret; +} + +/* + * @path: a pathname where / replaced with '\0'. + * @offsetp: pointer to int showing which path segment was last seen. + * Updated on return to reflect the next segment. + * @fulllen: full original path length. + * Returns a pointer to the next path segment, or NULL if done. + */ +static char *get_nextpath(char *path, int *offsetp, int fulllen) +{ + int offset = *offsetp; + + if (offset >= fulllen) + return NULL; + + while (path[offset] != '\0' && offset < fulllen) + offset++; + while (path[offset] == '\0' && offset < fulllen) + offset++; + + *offsetp = offset; + return (offset < fulllen) ? &path[offset] : NULL; +} + +/* + * Check that @subdir is a subdir of @dir. @len is the length of + * @dir (to avoid having to recalculate it). + */ +static bool is_subdir(const char *subdir, const char *dir, size_t len) +{ + size_t subdirlen = strlen(subdir); + + if (subdirlen < len) + return false; + if (strncmp(subdir, dir, len) != 0) + return false; + if (dir[len-1] == '/') + return true; + if (subdir[len] == '/' || subdirlen == len) + return true; + return false; +} + +/* + * Check if the open fd is a symlink. Return -ELOOP if it is. 
Return + * -ENOENT if we couldn't fstat. Return 0 if the fd is ok. + */ +static int check_symlink(int fd) +{ + struct stat sb; + int ret = fstat(fd, &sb); + if (ret < 0) + return -ENOENT; + if (S_ISLNK(sb.st_mode)) + return -ELOOP; + return 0; +} + +/* + * Open a file or directory, provided that it contains no symlinks. + * + * CAVEAT: This function must not be used for other purposes than container + * setup before executing the container's init + */ +static int open_if_safe(int dirfd, const char *nextpath) +{ + int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW); + if (newfd >= 0) // was not a symlink, all good + return newfd; + + if (errno == ELOOP) + return newfd; + + if (errno == EPERM || errno == EACCES) { + /* we're not root (because we got EPERM or EACCES), so + try opening with O_PATH */ + newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW); + if (newfd >= 0) { + /* O_PATH will return an fd for symlinks. We know + * nextpath wasn't a symlink at last openat, so if fd + * is now a link, then something fishy is going on + */ + int ret = check_symlink(newfd); + if (ret < 0) { + close(newfd); + newfd = ret; + } + } + } + + return newfd; +} + +/* + * Open a path intended for mounting, ensuring that the final path + * is inside the container's rootfs. + * + * CAVEAT: This function must not be used for other purposes than container + * setup before executing the container's init + * + * @target: path to be opened + * @prefix_skip: a part of @target in which to ignore symbolic links. This + * would be the container's rootfs. + * + * Return an open fd for the path, or <0 on error.
+ */ +static int open_without_symlink(const char *target, const char *prefix_skip) +{ + int curlen = 0, dirfd, fulllen, i; + char *dup = NULL; + + fulllen = strlen(target); + + /* make sure prefix-skip makes sense */ + if (prefix_skip && strlen(prefix_skip) > 0) { + curlen = strlen(prefix_skip); + if (!is_subdir(target, prefix_skip, curlen)) { + ERROR("WHOA there - target '%s' didn't start with prefix '%s'", + target, prefix_skip); + return -EINVAL; + } + /* + * get_nextpath() expects the curlen argument to be + * on a (turned into \0) / or before it, so decrement + * curlen to make sure that happens + */ + if (curlen) + curlen--; } else { - /* Only truncate the environment if we're actually going to - * overwrite part of it. */ - if (len >= arg_end - arg_start) { - env_start = env_end; + prefix_skip = "/"; + curlen = 0; + } + + /* Make a copy of target which we can hack up, and tokenize it */ + if ((dup = strdup(target)) == NULL) { + SYSERROR("Out of memory checking for symbolic link"); + return -ENOMEM; + } + for (i = 0; i < fulllen; i++) { + if (dup[i] == '/') + dup[i] = '\0'; + } + + dirfd = open(prefix_skip, O_RDONLY); + if (dirfd < 0) + goto out; + while (1) { + int newfd, saved_errno; + char *nextpath; + + if ((nextpath = get_nextpath(dup, &curlen, fulllen)) == NULL) + goto out; + newfd = open_if_safe(dirfd, nextpath); + saved_errno = errno; + close(dirfd); + dirfd = newfd; + if (newfd < 0) { + errno = saved_errno; + if (errno == ELOOP) + SYSERROR("%s in %s was a symbolic link!", nextpath, target); + goto out; } + } - arg_end = arg_start + len; +out: + free(dup); + return dirfd; +} - /* check overflow */ - if (arg_end < len || arg_end < arg_start) { - return -1; +/* + * Safely mount a path into a container, ensuring that the mount target + * is under the container's @rootfs. 
(If @rootfs is NULL, then the container + * uses the host's /). Returns 0 on success and a negative value on failure. + * + * CAVEAT: This function must not be used for other purposes than container + * setup before executing the container's init + */ +int safe_mount(const char *src, const char *dest, const char *fstype, + unsigned long flags, const void *data, const char *rootfs) +{ + int srcfd = -1, destfd, ret, saved_errno; + char srcbuf[50], destbuf[50]; // only needs enough for /proc/self/fd/<fd> + const char *mntsrc = src; + + if (!rootfs) + rootfs = ""; + + /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */ + if (flags & MS_BIND && src && src[0] != '/') { + INFO("this is a relative bind mount"); + srcfd = open_without_symlink(src, NULL); + if (srcfd < 0) + return srcfd; + ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd); + if (ret < 0 || (size_t)ret >= sizeof(srcbuf)) { + close(srcfd); + ERROR("Failed to build /proc/self/fd path for mount source"); + return -EINVAL; } + mntsrc = srcbuf; + } + destfd = open_without_symlink(dest, rootfs); + if (destfd < 0) { + if (srcfd != -1) { + saved_errno = errno; + close(srcfd); + errno = saved_errno; + } + return destfd; } - strcpy((char*)arg_start, title); + ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd); + if (ret < 0 || (size_t)ret >= sizeof(destbuf)) { + if (srcfd != -1) + close(srcfd); + close(destfd); + ERROR("Failed to build /proc/self/fd path for mount target"); + return -EINVAL; + } - ret |= prctl(PR_SET_MM, PR_SET_MM_ARG_START, arg_start, 0, 0); - ret |= prctl(PR_SET_MM, PR_SET_MM_ARG_END, arg_end, 0, 0); - ret |= prctl(PR_SET_MM, PR_SET_MM_ENV_START, env_start, 0, 0); - ret |= prctl(PR_SET_MM, PR_SET_MM_ENV_END, env_end, 0, 0); + ret = mount(mntsrc, destbuf, fstype, flags, data); + saved_errno = errno; + if (srcfd != -1) + close(srcfd); + close(destfd); + if (ret < 0) { + errno = saved_errno; + SYSERROR("Failed to mount %s onto %s", src, dest); + return ret; + } - return ret; + return 0; } /* @@ -1411,6 +1723,8 @@ int setproctitle(char *title) * * Returns < 0 on failure, 0 if the correct proc was already mounted * and 1 if a new proc was
mounted. + * + * NOTE: not to be called from inside the container namespace! */ int mount_proc_if_needed(const char *rootfs) { @@ -1429,6 +1743,10 @@ int mount_proc_if_needed(const char *rootfs) mypid = (int)getpid(); INFO("I am %d, /proc/self points to '%s'", mypid, link); ret = snprintf(path, MAXPATHLEN, "%s/proc", rootfs); + if (ret < 0 || ret >= MAXPATHLEN) { + SYSERROR("proc path name too long"); + return -1; + } if (linklen < 0) /* /proc not mounted */ goto domount; if (atoi(link) != mypid) { @@ -1440,29 +1758,139 @@ int mount_proc_if_needed(const char *rootfs) return 0; domount: - if (mount("proc", path, "proc", 0, NULL)) + if (!strcmp(rootfs,"")) /* rootfs is NULL */ + ret = mount("proc", path, "proc", 0, NULL); + else + ret = safe_mount("proc", path, "proc", 0, NULL, rootfs); + + if (ret < 0) return -1; + INFO("Mounted /proc in container for security transition"); return 1; } -int null_stdfds(void) +int open_devnull(void) { - int fd, ret = -1; + int fd = open("/dev/null", O_RDWR); + + if (fd < 0) + SYSERROR("Can't open /dev/null"); + + return fd; +} - fd = open("/dev/null", O_RDWR); +int set_stdfds(int fd) +{ if (fd < 0) return -1; if (dup2(fd, 0) < 0) - goto err; + return -1; if (dup2(fd, 1) < 0) - goto err; + return -1; if (dup2(fd, 2) < 0) - goto err; + return -1; + + return 0; +} + +int null_stdfds(void) +{ + int ret = -1; + int fd = open_devnull(); + + if (fd >= 0) { + ret = set_stdfds(fd); + close(fd); + } - ret = 0; -err: - close(fd); return ret; } + +/* + * Return the number of lines in file @fn, or -1 on error + */ +int lxc_count_file_lines(const char *fn) +{ + FILE *f; + char *line = NULL; + size_t sz = 0; + int n = 0; + + f = fopen_cloexec(fn, "r"); + if (!f) + return -1; + + while (getline(&line, &sz, f) != -1) { + n++; + } + free(line); + fclose(f); + return n; +} + +void *lxc_strmmap(void *addr, size_t length, int prot, int flags, int fd, + off_t offset) +{ + void *tmp = NULL, *overlap = NULL; + + /* We establish an anonymous mapping that is 
one byte larger than the + * underlying file. The pages handed to us are zero filled. */ + tmp = mmap(addr, length + 1, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (tmp == MAP_FAILED) + return tmp; + + /* Now we establish a fixed-address mapping starting at the address we + * received from our anonymous mapping and replace all bytes excluding + * the additional \0-byte with the file. This allows us to use normal + * string-handling functions. */ + overlap = mmap(tmp, length, prot, MAP_FIXED | flags, fd, offset); + if (overlap == MAP_FAILED) + munmap(tmp, length + 1); + + return overlap; +} + +int lxc_strmunmap(void *addr, size_t length) +{ + return munmap(addr, length + 1); +} + +/* Check whether signal number @signal is blocked by process @pid, based on the SigBlk mask in /proc/@pid/status (bit N-1 stands for signal N). Returns false on any error. */ +bool task_blocking_signal(pid_t pid, int signal) +{ + bool bret = false; + char *line = NULL; + long unsigned int sigblk = 0; + size_t n = 0; + int ret; + FILE *f; + + /* Room for "/proc", "/" followed by up to 20 pid digits (enough for any + * 64-bit integer), "/status" and the trailing \0. */ + size_t len = /* /proc */ 5 + /* /pid-to-str */ 21 + /* /status */ 7 + /* \0 */ 1; + char status[len]; + + ret = snprintf(status, len, "/proc/%d/status", pid); + if (ret < 0 || (size_t)ret >= len) + return bret; + + f = fopen(status, "r"); + if (!f) + return bret; + + while (getline(&line, &n, f) != -1) { + if (!strncmp(line, "SigBlk:\t", 8)) + if (sscanf(line + 8, "%lx", &sigblk) != 1) + goto out; + } + + if (signal > 0 && (size_t)signal <= sizeof(sigblk) * 8 && (sigblk & (1UL << (signal - 1)))) + bret = true; + +out: + free(line); + fclose(f); + return bret; +}