]> git.proxmox.com Git - mirror_lxc.git/blobdiff - src/lxc/conf.c
Fix strlcat's return value checks
[mirror_lxc.git] / src / lxc / conf.c
index db6c0434a59b3153fd9c23c89f08e308bfb7e105..d2ab8ceda20aa11038aff356c5b142f1cf06483f 100644 (file)
@@ -50,6 +50,7 @@
 #include "mount_utils.h"
 #include "namespace.h"
 #include "network.h"
+#include "open_utils.h"
 #include "parse.h"
 #include "process_utils.h"
 #include "ringbuf.h"
@@ -90,7 +91,7 @@
 #include <mntent.h>
 #endif
 
-#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
+#if !HAVE_PRLIMIT && HAVE_PRLIMIT64
 #include "prlimit.h"
 #endif
 
@@ -579,9 +580,8 @@ int lxc_rootfs_init(struct lxc_conf *conf, bool userns)
                         PROTECT_LOOKUP_BENEATH,
                         S_IWUSR | S_IRUSR);
        if (fd_pin < 0) {
-               if (errno == EROFS) {
+               if (errno == EROFS)
                        return log_trace_errno(0, EROFS, "Not pinning on read-only filesystem");
-               }
                return syserror("Failed to pin rootfs");
        }
 
@@ -709,9 +709,11 @@ static int lxc_mount_auto_mounts(struct lxc_handler *handler, int flags)
                { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW,    "proc",                                           "%r/proc",                    "proc",  MS_NODEV|MS_NOEXEC|MS_NOSUID,                    NULL, false },
                { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_RW,     "sysfs",                                          "%r/sys",                     "sysfs", 0,                                               NULL, false },
                { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_RO,     "sysfs",                                          "%r/sys",                     "sysfs", MS_RDONLY,                                       NULL, false },
+               /* /proc/sys is used as a temporary staging directory for the read-write sysfs mount and unmounted after binding net */
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "sysfs",                                          "%r/proc/sys",                "sysfs", MS_NOSUID|MS_NODEV|MS_NOEXEC,                    NULL, false },
                { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "sysfs",                                          "%r/sys",                     "sysfs", MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC,          NULL, false },
-               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "%r/sys/devices/virtual/net",                     "%r/sys/devices/virtual/net",  NULL,   MS_BIND,                                         NULL, false },
-               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  NULL,                                             "%r/sys/devices/virtual/net",  NULL,   MS_REMOUNT|MS_NOSUID|MS_NODEV|MS_NOEXEC,         NULL, false },
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "%r/proc/sys/devices/virtual/net",                "%r/sys/devices/virtual/net", NULL,    MS_BIND,                                         NULL, false },
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "%r/proc/sys",                                    NULL,                         NULL,    0,                                               NULL, false },
                { 0,                  0,                   NULL,                                             NULL,                         NULL,    0,                                               NULL, false }
        };
        struct lxc_conf *conf = handler->conf;
@@ -779,14 +781,21 @@ static int lxc_mount_auto_mounts(struct lxc_handler *handler, int flags)
                                return syserror_set(-ENOMEM, "Failed to create source path");
                }
 
-               if (!default_mounts[i].destination)
-                       return syserror_set(-EINVAL, "BUG: auto mounts destination %d was NULL", i);
-
                if (!has_cap_net_admin && default_mounts[i].requires_cap_net_admin) {
                        TRACE("Container does not have CAP_NET_ADMIN. Skipping \"%s\" mount", default_mounts[i].source ?: "(null)");
                        continue;
                }
 
+               if (!default_mounts[i].destination) {
+                       ret = umount2(source, MNT_DETACH);
+                       if (ret < 0)
+                               return log_error_errno(-1, errno,
+                                                      "Failed to unmount \"%s\"",
+                                                      source);
+                       TRACE("Unmounted automount \"%s\"", source);
+                       continue;
+               }
+
                /* will act like strdup if %r is not present */
                destination = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].destination);
                if (!destination)
@@ -914,29 +923,29 @@ static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
 }
 
 /* Build a space-separate list of ptys to pass to systemd. */
-static bool append_ttyname(char **pp, char *name)
+static bool append_ttyname(struct lxc_tty_info *ttys, char *tty_name)
 {
-       char *p;
+       char *tty_names, *buf;
        size_t size;
 
-       if (!*pp) {
-               *pp = zalloc(strlen(name) + strlen("container_ttys=") + 1);
-               if (!*pp)
-                       return false;
+       if (!tty_name)
+               return false;
 
-               sprintf(*pp, "container_ttys=%s", name);
-               return true;
-       }
+       size = strlen(tty_name) + 1;
+       if (ttys->tty_names)
+               size += strlen(ttys->tty_names) + 1;
 
-       size = strlen(*pp) + strlen(name) + 2;
-       p = realloc(*pp, size);
-       if (!p)
+       buf = realloc(ttys->tty_names, size);
+       if (!buf)
                return false;
+       tty_names = buf;
 
-       *pp = p;
-       (void)strlcat(p, " ", size);
-       (void)strlcat(p, name, size);
-
+       if (ttys->tty_names)
+               (void)strlcat(buf, " ", size);
+       else
+               buf[0] = '\0';
+       (void)strlcat(buf, tty_name, size);
+       ttys->tty_names = tty_names;
        return true;
 }
 
@@ -949,7 +958,7 @@ static int open_ttymnt_at(int dfd, const char *path)
                     PROTECT_LOOKUP_BENEATH,
                     0);
        if (fd < 0) {
-               if (!IN_SET(errno, ENXIO, EEXIST))
+               if (errno != ENXIO && errno != EEXIST)
                        return syserror("Failed to create \"%d/\%s\"", dfd, path);
 
                SYSINFO("Failed to create \"%d/\%s\"", dfd, path);
@@ -1057,7 +1066,7 @@ static int lxc_setup_ttys(struct lxc_conf *conf)
                        DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, rootfs->buf);
                }
 
-               if (!append_ttyname(&conf->ttys.tty_names, tty->name))
+               if (!append_ttyname(&conf->ttys, tty->name))
                        return log_error(-1, "Error setting up container_ttys string");
        }
 
@@ -1081,17 +1090,20 @@ static int lxc_allocate_ttys(struct lxc_conf *conf)
                return -ENOMEM;
 
        for (size_t i = 0; i < conf->ttys.max; i++) {
-               int pty_nr = -1;
                struct lxc_terminal_info *tty = &ttys->tty[i];
 
                ret = lxc_devpts_terminal(conf->devpts_fd, &tty->ptx,
-                                         &tty->pty, &pty_nr, false);
+                                         &tty->pty, &tty->pty_nr, false);
                if (ret < 0) {
                        conf->ttys.max = i;
                        return syserror_set(-ENOTTY, "Failed to create tty %zu", i);
                }
+               ret = strnprintf(tty->name, sizeof(tty->name), "pts/%d", tty->pty_nr);
+               if (ret < 0)
+                       return syserror("Failed to create tty %zu", i);
+
                DEBUG("Created tty with ptx fd %d and pty fd %d and index %d",
-                     tty->ptx, tty->pty, pty_nr);
+                     tty->ptx, tty->pty, tty->pty_nr);
                tty->busy = -1;
        }
 
@@ -1172,6 +1184,7 @@ static int lxc_create_ttys(struct lxc_handler *handler)
                        SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
                        goto on_error;
                }
+               TRACE("Set \"container_ttys=%s\"", conf->ttys.tty_names);
        }
 
        return 0;
@@ -1432,6 +1445,23 @@ static int lxc_mount_rootfs(struct lxc_rootfs *rootfs)
        return log_trace(0, "Container uses separate rootfs. Opened container's rootfs");
 }
 
+static bool lxc_rootfs_overmounted(struct lxc_rootfs *rootfs)
+{
+       __do_close int fd_rootfs = -EBADF;
+
+       if (!rootfs->path)
+               fd_rootfs = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
+       else
+               fd_rootfs = open_at(-EBADF, rootfs->mount, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
+       if (fd_rootfs < 0)
+               return true;
+
+       if (!same_file_lax(rootfs->dfd_mnt, fd_rootfs))
+               return syswarn_ret(true, "Rootfs seems to have changed after setting up mounts");
+
+       return false;
+}
+
 static int lxc_chroot(const struct lxc_rootfs *rootfs)
 {
        __do_free char *nroot = NULL;
@@ -1576,8 +1606,11 @@ static int lxc_pivot_root(const struct lxc_rootfs *rootfs)
                return log_error_errno(-errno, errno, "Failed to enter old root directory");
 
        /*
-        * Make fd_oldroot a depedent mount to make sure our umounts don't
-        * propagate to the host.
+        * Unprivileged containers will have had all their mounts turned into
+        * dependent mounts when the container was created. But for privileged
+        * containers we need to turn the old root mount tree into a dependent
+        * mount tree to prevent propagating mounts and umounts into the host
+        * mount namespace.
         */
        ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
        if (ret < 0)
@@ -1591,6 +1624,31 @@ static int lxc_pivot_root(const struct lxc_rootfs *rootfs)
        if (ret < 0)
                return log_error_errno(-errno, errno, "Failed to re-enter new root directory \"%s\"", rootfs->mount);
 
+       /*
+        * Finally, we turn the rootfs into a shared mount. Note that this
+        * doesn't reestablish mount propagation with the host's mount
+        * namespace. Instead we'll create a new peer group.
+        *
+        * We're doing this because most workloads do rely on the rootfs being
+        * a shared mount. For example, systemd daemons like systemd-udevd run
+        * in their own mount namespace. Their mount namespace has been made a
+        * dependent mount (MS_SLAVE) with the host rootfs as its dominating
+        * mount. This means new mounts on the host propagate into the
+        * respective services.
+        *
+        * This is broken if we leave the container's rootfs a dependent mount.
+        * In which case both the container's rootfs and the service's rootfs
+        * will be dependent mounts with the host's rootfs as their dominating
+        * mount. So if you were to mount over the rootfs from the host it
+        * would not just propagate into the container's mount namespace it
+        * would also propagate into the service. That's nonsense semantics for
+        * nearly all relevant use-cases. Instead, establish the container's
+        * rootfs as a separate peer group mirroring the behavior on the host.
+        */
+       ret = mount("", ".", "", MS_SHARED | MS_REC, NULL);
+       if (ret < 0)
+               return log_error_errno(-errno, errno, "Failed to turn new root mount tree into shared mount tree");
+
        TRACE("Changed into new rootfs \"%s\"", rootfs->mount);
        return 0;
 }
@@ -1852,7 +1910,8 @@ static int lxc_finish_devpts_child(struct lxc_handler *handler)
                return syserror("Failed to create path");
 
        close_prot_errno_disarm(conf->devpts_fd);
-       return umount2(rootfs->buf, MNT_DETACH);
+       (void)umount2(rootfs->buf, MNT_DETACH);
+       return 0;
 }
 
 static int lxc_send_devpts_to_parent(struct lxc_handler *handler)
@@ -2153,7 +2212,7 @@ static int lxc_setup_console(const struct lxc_handler *handler,
 
 static int parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
 {
-       ssize_t ret;
+       size_t ret;
 
        /* If '=' is contained in opt, the option must go into data. */
        if (!strchr(opt, '=')) {
@@ -2177,12 +2236,12 @@ static int parse_mntopt(char *opt, unsigned long *flags, char **data, size_t siz
 
        if (strlen(*data)) {
                ret = strlcat(*data, ",", size);
-               if (ret < 0)
+               if (ret >= size)
                        return log_error_errno(ret, errno, "Failed to append \",\" to %s", *data);
        }
 
        ret = strlcat(*data, opt, size);
-       if (ret < 0)
+       if (ret >= size)
                return log_error_errno(ret, errno, "Failed to append \"%s\" to %s", opt, *data);
 
        return 0;
@@ -2855,7 +2914,7 @@ static int __lxc_idmapped_mounts_child(struct lxc_handler *handler, FILE *f)
                struct lxc_mount_options opts = {};
                int dfd_from;
                const char *source_relative, *target_relative;
-               struct lxc_mount_attr attr = {};
+               struct mount_attr attr = {};
 
                ret = parse_lxc_mount_attrs(&opts, mntent.mnt_opts);
                if (ret < 0)
@@ -2975,7 +3034,7 @@ static int __lxc_idmapped_mounts_child(struct lxc_handler *handler, FILE *f)
 
                /* Set propagation mount options. */
                if (opts.attr.propagation) {
-                       attr = (struct lxc_mount_attr) {
+                       attr = (struct mount_attr) {
                                .propagation = opts.attr.propagation,
                        };
 
@@ -3010,7 +3069,7 @@ static int __lxc_idmapped_mounts_child(struct lxc_handler *handler, FILE *f)
                        dfd_from = rootfs->dfd_mnt;
                else
                        dfd_from = rootfs->dfd_host;
-               fd_to = open_at(dfd_from, target_relative, PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH_WITH_SYMLINKS, 0);
+               fd_to = open_at(dfd_from, target_relative, PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH_XDEV, 0);
                if (fd_to < 0) {
                        if (opts.optional) {
                                TRACE("Skipping optional idmapped mount");
@@ -3271,7 +3330,7 @@ int setup_sysctl_parameters(struct lxc_conf *conf)
        char filename[PATH_MAX] = {0};
        struct lxc_sysctl *sysctl, *nsysctl;
 
-       if (!list_empty(&conf->sysctls))
+       if (list_empty(&conf->sysctls))
                return 0;
 
        list_for_each_entry_safe(sysctl, nsysctl, &conf->sysctls, head) {
@@ -3288,8 +3347,11 @@ int setup_sysctl_parameters(struct lxc_conf *conf)
                if (ret < 0)
                        return log_error_errno(-1, errno, "Failed to setup sysctl parameters %s to %s",
                                               sysctl->key, sysctl->value);
+
+               TRACE("Setting %s to %s", filename, sysctl->value);
        }
 
+       TRACE("Setup /proc/sys settings");
        return 0;
 }
 
@@ -3300,7 +3362,7 @@ int setup_proc_filesystem(struct lxc_conf *conf, pid_t pid)
        char filename[PATH_MAX] = {0};
        struct lxc_proc *proc;
 
-       if (!list_empty(&conf->procs))
+       if (list_empty(&conf->procs))
                return 0;
 
        list_for_each_entry(proc, &conf->procs, head) {
@@ -3317,6 +3379,8 @@ int setup_proc_filesystem(struct lxc_conf *conf, pid_t pid)
                if (ret < 0)
                        return log_error_errno(-1, errno, "Failed to setup proc filesystem %s to %s",
                                               proc->filename, proc->value);
+
+               TRACE("Setting %s to %s", filename, proc->value);
        }
 
        TRACE("Setup /proc/%d settings", pid);
@@ -4074,7 +4138,7 @@ int lxc_idmapped_mounts_parent(struct lxc_handler *handler)
 
        for (;;) {
                __do_close int fd_from = -EBADF, fd_userns = -EBADF;
-               struct lxc_mount_attr attr = {};
+               struct mount_attr attr = {};
                struct lxc_mount_options opts = {};
                ssize_t ret;
 
@@ -4085,7 +4149,7 @@ int lxc_idmapped_mounts_parent(struct lxc_handler *handler)
                        return syserror("Failed to receive idmapped mount file descriptors from child");
 
                if (fd_from < 0 || fd_userns < 0)
-                       return log_trace(0, "Finished receiving idmapped mount file descriptors from child");
+                       return log_trace(0, "Finished receiving idmapped mount file descriptors (%d | %d) from child", fd_from, fd_userns);
 
                attr.attr_set   = MOUNT_ATTR_IDMAP;
                attr.userns_fd  = fd_userns;
@@ -4132,6 +4196,7 @@ static int lxc_recv_ttys_from_child(struct lxc_handler *handler)
        for (size_t i = 0; i < ttys_max; i++) {
                terminal_info = &info_new->tty[i];
                terminal_info->busy = -1;
+               terminal_info->pty_nr = -1;
                terminal_info->ptx = -EBADF;
                terminal_info->pty = -EBADF;
        }
@@ -4280,6 +4345,14 @@ static int setup_capabilities(struct lxc_conf *conf)
        return 0;
 }
 
+static int make_shmount_dependent_mount(const struct lxc_conf *conf)
+{
+       if (!(conf->auto_mounts & LXC_AUTO_SHMOUNTS_MASK))
+               return 0;
+
+       return mount(NULL, conf->shmount.path_cont, NULL, MS_REC | MS_SLAVE, 0);
+}
+
 int lxc_setup(struct lxc_handler *handler)
 {
        int ret;
@@ -4363,6 +4436,9 @@ int lxc_setup(struct lxc_handler *handler)
        if (ret < 0)
                return log_error(-1, "Failed to run mount hooks");
 
+       if (lxc_rootfs_overmounted(&lxc_conf->rootfs))
+               return log_error(-1, "Rootfs overmounted");
+
        if (lxc_conf->autodev > 0) {
                ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
                if (ret < 0)
@@ -4406,6 +4482,11 @@ int lxc_setup(struct lxc_handler *handler)
        if (ret < 0)
                return log_error(-1, "Failed to pivot root into rootfs");
 
+       ret = make_shmount_dependent_mount(lxc_conf);
+       if (ret < 0)
+               return log_error(-1, "Failed to turn mount tunnel \"%s\" into dependent mount",
+                                lxc_conf->shmount.path_cont);
+
        /* Setting the boot-id is best-effort for now. */
        if (lxc_conf->autodev > 0)
                (void)lxc_setup_boot_id();
@@ -4792,6 +4873,7 @@ void lxc_conf_free(struct lxc_conf *conf)
        free(conf->cgroup_meta.container_dir);
        free(conf->cgroup_meta.namespace_dir);
        free(conf->cgroup_meta.controllers);
+       free(conf->cgroup_meta.systemd_scope);
        free(conf->shmount.path_host);
        free(conf->shmount.path_cont);
        free(conf);
@@ -5471,11 +5553,20 @@ int userns_exec_mapped_root(const char *path, int path_fd,
 
                close_prot_errno_disarm(sock_fds[0]);
 
-               if (!lxc_switch_uid_gid(0, 0))
+               if (!lxc_drop_groups() && errno != EPERM)
+                       _exit(EXIT_FAILURE);
+
+               ret = setresgid(0, 0, 0);
+               if (ret < 0) {
+                       SYSERROR("Failed to setresgid(0, 0, 0)");
                        _exit(EXIT_FAILURE);
+               }
 
-               if (!lxc_drop_groups())
+               ret = setresuid(0, 0, 0);
+               if (ret < 0) {
+                       SYSERROR("Failed to setresuid(0, 0, 0)");
                        _exit(EXIT_FAILURE);
+               }
 
                ret = fchown(target_fd, 0, st.st_gid);
                if (ret) {
@@ -5523,9 +5614,12 @@ on_error:
 
        /* Wait for child to finish. */
        if (pid < 0)
+               return log_error(-1, "Failed to create child process");
+
+       if (!wait_exited(pid))
                return -1;
 
-       return wait_for_pid(pid);
+       return 0;
 }
 
 /* not thread-safe, do not use from api without first forking */