#include "mount_utils.h"
#include "namespace.h"
#include "network.h"
+#include "open_utils.h"
#include "parse.h"
#include "process_utils.h"
#include "ringbuf.h"
#include <mntent.h>
#endif
-#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
+#if !HAVE_PRLIMIT && HAVE_PRLIMIT64
#include "prlimit.h"
#endif
PROTECT_LOOKUP_BENEATH,
S_IWUSR | S_IRUSR);
if (fd_pin < 0) {
- if (errno == EROFS) {
+ if (errno == EROFS)
return log_trace_errno(0, EROFS, "Not pinning on read-only filesystem");
- }
return syserror("Failed to pin rootfs");
}
{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL, false },
{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL, false },
+ /* /proc/sys is used as a temporary staging directory for the read-write sysfs mount and unmounted after binding net */
+ { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/proc/sys", "sysfs", MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL, false },
{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL, false },
- { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL, false },
- { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL, false },
+ { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/proc/sys/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL, false },
+ { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/proc/sys", NULL, NULL, 0, NULL, false },
{ 0, 0, NULL, NULL, NULL, 0, NULL, false }
};
struct lxc_conf *conf = handler->conf;
return syserror_set(-ENOMEM, "Failed to create source path");
}
- if (!default_mounts[i].destination)
- return syserror_set(-EINVAL, "BUG: auto mounts destination %d was NULL", i);
-
if (!has_cap_net_admin && default_mounts[i].requires_cap_net_admin) {
TRACE("Container does not have CAP_NET_ADMIN. Skipping \"%s\" mount", default_mounts[i].source ?: "(null)");
continue;
}
+ if (!default_mounts[i].destination) {
+ ret = umount2(source, MNT_DETACH);
+ if (ret < 0)
+ return log_error_errno(-1, errno,
+ "Failed to unmount \"%s\"",
+ source);
+ TRACE("Unmounted automount \"%s\"", source);
+ continue;
+ }
+
/* will act like strdup if %r is not present */
destination = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].destination);
if (!destination)
}
/* Build a space-separate list of ptys to pass to systemd. */
-static bool append_ttyname(char **pp, char *name)
+static bool append_ttyname(struct lxc_tty_info *ttys, char *tty_name)
{
- char *p;
+	/*
+	 * Append @tty_name to the space-separated list kept in
+	 * ttys->tty_names, growing the buffer with realloc().
+	 *
+	 * Returns true on success. Returns false if @tty_name is NULL or
+	 * the allocation fails; in both cases ttys->tty_names is unchanged.
+	 */
+	bool have_names;
+	char *buf;
size_t size;
- if (!*pp) {
- *pp = zalloc(strlen(name) + strlen("container_ttys=") + 1);
- if (!*pp)
- return false;
+	if (!tty_name)
+		return false;
- sprintf(*pp, "container_ttys=%s", name);
- return true;
- }
+	/*
+	 * Remember whether a list already exists before calling realloc():
+	 * once realloc() succeeds the old pointer may have been freed and
+	 * its value must not be inspected anymore.
+	 */
+	have_names = ttys->tty_names != NULL;
+
+	/* Room for the new name and its NUL, plus old list and separator. */
+	size = strlen(tty_name) + 1;
+	if (have_names)
+		size += strlen(ttys->tty_names) + 1;
- size = strlen(*pp) + strlen(name) + 2;
- p = realloc(*pp, size);
- if (!p)
+	buf = realloc(ttys->tty_names, size);
+	if (!buf)
return false;
+	ttys->tty_names = buf;
- *pp = p;
- (void)strlcat(p, " ", size);
- (void)strlcat(p, name, size);
-
+	if (have_names)
+		(void)strlcat(buf, " ", size);
+	else
+		buf[0] = '\0'; /* fresh buffer: make it an empty string first */
+	(void)strlcat(buf, tty_name, size);
return true;
}
PROTECT_LOOKUP_BENEATH,
0);
if (fd < 0) {
- if (!IN_SET(errno, ENXIO, EEXIST))
+ if (errno != ENXIO && errno != EEXIST)
return syserror("Failed to create \"%d/\%s\"", dfd, path);
SYSINFO("Failed to create \"%d/\%s\"", dfd, path);
DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, rootfs->buf);
}
- if (!append_ttyname(&conf->ttys.tty_names, tty->name))
+ if (!append_ttyname(&conf->ttys, tty->name))
return log_error(-1, "Error setting up container_ttys string");
}
return -ENOMEM;
for (size_t i = 0; i < conf->ttys.max; i++) {
- int pty_nr = -1;
struct lxc_terminal_info *tty = &ttys->tty[i];
ret = lxc_devpts_terminal(conf->devpts_fd, &tty->ptx,
- &tty->pty, &pty_nr, false);
+ &tty->pty, &tty->pty_nr, false);
if (ret < 0) {
conf->ttys.max = i;
return syserror_set(-ENOTTY, "Failed to create tty %zu", i);
}
+ ret = strnprintf(tty->name, sizeof(tty->name), "pts/%d", tty->pty_nr);
+ if (ret < 0)
+ return syserror("Failed to create tty %zu", i);
+
DEBUG("Created tty with ptx fd %d and pty fd %d and index %d",
- tty->ptx, tty->pty, pty_nr);
+ tty->ptx, tty->pty, tty->pty_nr);
tty->busy = -1;
}
SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
goto on_error;
}
+ TRACE("Set \"container_ttys=%s\"", conf->ttys.tty_names);
}
return 0;
return log_trace(0, "Container uses separate rootfs. Opened container's rootfs");
}
+/*
+ * Check whether the rootfs still matches the descriptor recorded in
+ * rootfs->dfd_mnt.
+ *
+ * Opens "/" when no rootfs path is configured, otherwise rootfs->mount, and
+ * hands both descriptors to same_file_lax(). Returns true if the path cannot
+ * be opened or same_file_lax() reports a mismatch (a warning is logged in
+ * the latter case), false otherwise.
+ */
+static bool lxc_rootfs_overmounted(struct lxc_rootfs *rootfs)
+{
+	__do_close int fd_rootfs = -EBADF;
+
+	fd_rootfs = open_at(-EBADF,
+			    rootfs->path ? rootfs->mount : "/",
+			    PROTECT_OPATH_DIRECTORY,
+			    rootfs->path ? PROTECT_LOOKUP_ABSOLUTE_XDEV : PROTECT_LOOKUP_ABSOLUTE,
+			    0);
+	if (fd_rootfs < 0)
+		return true;
+
+	if (same_file_lax(rootfs->dfd_mnt, fd_rootfs))
+		return false;
+
+	return syswarn_ret(true, "Rootfs seems to have changed after setting up mounts");
+}
+
static int lxc_chroot(const struct lxc_rootfs *rootfs)
{
__do_free char *nroot = NULL;
return log_error_errno(-errno, errno, "Failed to enter old root directory");
/*
- * Make fd_oldroot a depedent mount to make sure our umounts don't
- * propagate to the host.
+ * Unprivileged containers will have had all their mounts turned into
+ * dependent mounts when the container was created. But for privileged
+ * containers we need to turn the old root mount tree into a dependent
+ * mount tree to prevent propagating mounts and umounts into the host
+ * mount namespace.
*/
ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
if (ret < 0)
if (ret < 0)
return log_error_errno(-errno, errno, "Failed to re-enter new root directory \"%s\"", rootfs->mount);
+ /*
+ * Finally, we turn the rootfs into a shared mount. Note, that this
+ * doesn't reestablish mount propagation with the host's mount
+ * namespace. Instead we'll create a new peer group.
+ *
+ * We're doing this because most workloads do rely on the rootfs being
+ * a shared mount. For example, systemd daemons like systemd-udevd run in
+ * their own mount namespace. Their mount namespace has been made a
+ * dependent mount (MS_SLAVE) with the host rootfs as its dominating
+ * mount. This means new mounts on the host propagate into the
+ * respective services.
+ *
+ * This is broken if we leave the container's rootfs a dependent mount.
+ * In which case both the container's rootfs and the service's rootfs
+ * will be dependent mounts with the host's rootfs as their dominating
+ * mount. So if you were to mount over the rootfs from the host it
+ * would not just propagate into the container's mount namespace it
+ * would also propagate into the service. That's nonsense semantics for
+ * nearly all relevant use-cases. Instead, establish the container's
+ * rootfs as a separate peer group mirroring the behavior on the host.
+ */
+ ret = mount("", ".", "", MS_SHARED | MS_REC, NULL);
+ if (ret < 0)
+ return log_error_errno(-errno, errno, "Failed to turn new root mount tree into shared mount tree");
+
TRACE("Changed into new rootfs \"%s\"", rootfs->mount);
return 0;
}
return syserror("Failed to create path");
close_prot_errno_disarm(conf->devpts_fd);
- return umount2(rootfs->buf, MNT_DETACH);
+ (void)umount2(rootfs->buf, MNT_DETACH);
+ return 0;
}
static int lxc_send_devpts_to_parent(struct lxc_handler *handler)
static int parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
{
- ssize_t ret;
+ size_t ret;
/* If '=' is contained in opt, the option must go into data. */
if (!strchr(opt, '=')) {
if (strlen(*data)) {
ret = strlcat(*data, ",", size);
- if (ret < 0)
+ if (ret >= size)
return log_error_errno(ret, errno, "Failed to append \",\" to %s", *data);
}
ret = strlcat(*data, opt, size);
- if (ret < 0)
+ if (ret >= size)
return log_error_errno(ret, errno, "Failed to append \"%s\" to %s", opt, *data);
return 0;
struct lxc_mount_options opts = {};
int dfd_from;
const char *source_relative, *target_relative;
- struct lxc_mount_attr attr = {};
+ struct mount_attr attr = {};
ret = parse_lxc_mount_attrs(&opts, mntent.mnt_opts);
if (ret < 0)
/* Set propagation mount options. */
if (opts.attr.propagation) {
- attr = (struct lxc_mount_attr) {
+ attr = (struct mount_attr) {
.propagation = opts.attr.propagation,
};
dfd_from = rootfs->dfd_mnt;
else
dfd_from = rootfs->dfd_host;
- fd_to = open_at(dfd_from, target_relative, PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH_WITH_SYMLINKS, 0);
+ fd_to = open_at(dfd_from, target_relative, PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH_XDEV, 0);
if (fd_to < 0) {
if (opts.optional) {
TRACE("Skipping optional idmapped mount");
char filename[PATH_MAX] = {0};
struct lxc_sysctl *sysctl, *nsysctl;
- if (!list_empty(&conf->sysctls))
+ if (list_empty(&conf->sysctls))
return 0;
list_for_each_entry_safe(sysctl, nsysctl, &conf->sysctls, head) {
if (ret < 0)
return log_error_errno(-1, errno, "Failed to setup sysctl parameters %s to %s",
sysctl->key, sysctl->value);
+
+ TRACE("Setting %s to %s", filename, sysctl->value);
}
+ TRACE("Setup /proc/sys settings");
return 0;
}
char filename[PATH_MAX] = {0};
struct lxc_proc *proc;
- if (!list_empty(&conf->procs))
+ if (list_empty(&conf->procs))
return 0;
list_for_each_entry(proc, &conf->procs, head) {
if (ret < 0)
return log_error_errno(-1, errno, "Failed to setup proc filesystem %s to %s",
proc->filename, proc->value);
+
+ TRACE("Setting %s to %s", filename, proc->value);
}
TRACE("Setup /proc/%d settings", pid);
for (;;) {
__do_close int fd_from = -EBADF, fd_userns = -EBADF;
- struct lxc_mount_attr attr = {};
+ struct mount_attr attr = {};
struct lxc_mount_options opts = {};
ssize_t ret;
return syserror("Failed to receive idmapped mount file descriptors from child");
if (fd_from < 0 || fd_userns < 0)
- return log_trace(0, "Finished receiving idmapped mount file descriptors from child");
+ return log_trace(0, "Finished receiving idmapped mount file descriptors (%d | %d) from child", fd_from, fd_userns);
attr.attr_set = MOUNT_ATTR_IDMAP;
attr.userns_fd = fd_userns;
for (size_t i = 0; i < ttys_max; i++) {
terminal_info = &info_new->tty[i];
terminal_info->busy = -1;
+ terminal_info->pty_nr = -1;
terminal_info->ptx = -EBADF;
terminal_info->pty = -EBADF;
}
return 0;
}
+/*
+ * Recursively mark the mount at conf->shmount.path_cont as MS_SLAVE.
+ *
+ * Does nothing and returns 0 unless one of the LXC_AUTO_SHMOUNTS_MASK bits
+ * is set in conf->auto_mounts; otherwise returns the result of mount(2).
+ */
+static int make_shmount_dependent_mount(const struct lxc_conf *conf)
+{
+	int ret = 0;
+
+	if (conf->auto_mounts & LXC_AUTO_SHMOUNTS_MASK)
+		ret = mount(NULL, conf->shmount.path_cont, NULL, MS_REC | MS_SLAVE, 0);
+
+	return ret;
+}
+
int lxc_setup(struct lxc_handler *handler)
{
int ret;
if (ret < 0)
return log_error(-1, "Failed to run mount hooks");
+ if (lxc_rootfs_overmounted(&lxc_conf->rootfs))
+ return log_error(-1, "Rootfs overmounted");
+
if (lxc_conf->autodev > 0) {
ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
if (ret < 0)
if (ret < 0)
return log_error(-1, "Failed to pivot root into rootfs");
+ ret = make_shmount_dependent_mount(lxc_conf);
+ if (ret < 0)
+ return log_error(-1, "Failed to turn mount tunnel \"%s\" into dependent mount",
+ lxc_conf->shmount.path_cont);
+
/* Setting the boot-id is best-effort for now. */
if (lxc_conf->autodev > 0)
(void)lxc_setup_boot_id();
free(conf->cgroup_meta.container_dir);
free(conf->cgroup_meta.namespace_dir);
free(conf->cgroup_meta.controllers);
+ free(conf->cgroup_meta.systemd_scope);
free(conf->shmount.path_host);
free(conf->shmount.path_cont);
free(conf);
close_prot_errno_disarm(sock_fds[0]);
- if (!lxc_switch_uid_gid(0, 0))
+ if (!lxc_drop_groups() && errno != EPERM)
+ _exit(EXIT_FAILURE);
+
+ ret = setresgid(0, 0, 0);
+ if (ret < 0) {
+ SYSERROR("Failed to setresgid(0, 0, 0)");
_exit(EXIT_FAILURE);
+ }
- if (!lxc_drop_groups())
+ ret = setresuid(0, 0, 0);
+ if (ret < 0) {
+ SYSERROR("Failed to setresuid(0, 0, 0)");
_exit(EXIT_FAILURE);
+ }
ret = fchown(target_fd, 0, st.st_gid);
if (ret) {
/* Wait for child to finish. */
if (pid < 0)
+ return log_error(-1, "Failed to create child process");
+
+ if (!wait_exited(pid))
return -1;
- return wait_for_pid(pid);
+ return 0;
}
/* not thread-safe, do not use from api without first forking */