git.proxmox.com Git - mirror_lxc.git/blobdiff - src/lxc/conf.c
Fix strlcat's return value checks
[mirror_lxc.git] / src / lxc / conf.c
index fa1777699a646f55b43a1609ede74b39298af663..d2ab8ceda20aa11038aff356c5b142f1cf06483f 100644 (file)
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: LGPL-2.1+ */
 
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE 1
-#endif
+#include "config.h"
+
 #include <arpa/inet.h>
 #include <dirent.h>
 #include <errno.h>
 #include <time.h>
 #include <unistd.h>
 
+#include "conf.h"
 #include "af_unix.h"
 #include "caps.h"
 #include "cgroups/cgroup.h"
 #include "compiler.h"
-#include "conf.h"
-#include "config.h"
 #include "confile.h"
 #include "confile_utils.h"
 #include "error.h"
@@ -52,6 +50,7 @@
 #include "mount_utils.h"
 #include "namespace.h"
 #include "network.h"
+#include "open_utils.h"
 #include "parse.h"
 #include "process_utils.h"
 #include "ringbuf.h"
 #if HAVE_OPENPTY
 #include <pty.h>
 #else
-#include <../include/openpty.h>
+#include "openpty.h"
 #endif
 
 #if HAVE_LIBCAP
 #include <sys/capability.h>
 #endif
 
-#ifndef HAVE_STRLCAT
-#include "include/strlcat.h"
+#if !HAVE_STRLCAT
+#include "strlcat.h"
 #endif
 
 #if IS_BIONIC
-#include <../include/lxcmntent.h>
+#include "lxcmntent.h"
 #else
 #include <mntent.h>
 #endif
 
-#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
-#include <../include/prlimit.h>
+#if !HAVE_PRLIMIT && HAVE_PRLIMIT64
+#include "prlimit.h"
 #endif
 
-#ifndef HAVE_STRLCPY
-#include "include/strlcpy.h"
+#if !HAVE_STRLCPY
+#include "strlcpy.h"
 #endif
 
-#ifndef HAVE_STRCHRNUL
-#include "include/strchrnul.h"
+#if !HAVE_STRCHRNUL
+#include "strchrnul.h"
 #endif
 
 lxc_log_define(conf, lxc);
@@ -135,7 +134,7 @@ struct mount_opt {
 
 struct caps_opt {
        char *name;
-       int value;
+       __u32 value;
 };
 
 struct limit_opt {
@@ -185,7 +184,7 @@ static struct mount_opt propagation_opt[] = {
        { "rshared",     0, true,  MS_SHARED,     MS_SHARED | MS_REC     },
        { "rslave",      0, true,  MS_SLAVE,      MS_SLAVE | MS_REC      },
        { "runbindable", 0, true,  MS_UNBINDABLE, MS_UNBINDABLE | MS_REC },
-       { NULL,          0, 0                                            },
+       { NULL,          0, false, 0,             0                     },
 };
 
 static struct caps_opt caps_opt[] = {
@@ -581,9 +580,8 @@ int lxc_rootfs_init(struct lxc_conf *conf, bool userns)
                         PROTECT_LOOKUP_BENEATH,
                         S_IWUSR | S_IRUSR);
        if (fd_pin < 0) {
-               if (errno == EROFS) {
+               if (errno == EROFS)
                        return log_trace_errno(0, EROFS, "Not pinning on read-only filesystem");
-               }
                return syserror("Failed to pin rootfs");
        }
 
@@ -711,9 +709,11 @@ static int lxc_mount_auto_mounts(struct lxc_handler *handler, int flags)
                { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW,    "proc",                                           "%r/proc",                    "proc",  MS_NODEV|MS_NOEXEC|MS_NOSUID,                    NULL, false },
                { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_RW,     "sysfs",                                          "%r/sys",                     "sysfs", 0,                                               NULL, false },
                { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_RO,     "sysfs",                                          "%r/sys",                     "sysfs", MS_RDONLY,                                       NULL, false },
+               /* /proc/sys is used as a temporary staging directory for the read-write sysfs mount and unmounted after binding net */
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "sysfs",                                          "%r/proc/sys",                "sysfs", MS_NOSUID|MS_NODEV|MS_NOEXEC,                    NULL, false },
                { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "sysfs",                                          "%r/sys",                     "sysfs", MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC,          NULL, false },
-               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "%r/sys/devices/virtual/net",                     "%r/sys/devices/virtual/net",  NULL,   MS_BIND,                                         NULL, false },
-               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  NULL,                                             "%r/sys/devices/virtual/net",  NULL,   MS_REMOUNT|MS_NOSUID|MS_NODEV|MS_NOEXEC,         NULL, false },
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "%r/proc/sys/devices/virtual/net",                "%r/sys/devices/virtual/net", NULL,    MS_BIND,                                         NULL, false },
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "%r/proc/sys",                                    NULL,                         NULL,    0,                                               NULL, false },
                { 0,                  0,                   NULL,                                             NULL,                         NULL,    0,                                               NULL, false }
        };
        struct lxc_conf *conf = handler->conf;
@@ -781,14 +781,21 @@ static int lxc_mount_auto_mounts(struct lxc_handler *handler, int flags)
                                return syserror_set(-ENOMEM, "Failed to create source path");
                }
 
-               if (!default_mounts[i].destination)
-                       return syserror_set(-EINVAL, "BUG: auto mounts destination %d was NULL", i);
-
                if (!has_cap_net_admin && default_mounts[i].requires_cap_net_admin) {
                        TRACE("Container does not have CAP_NET_ADMIN. Skipping \"%s\" mount", default_mounts[i].source ?: "(null)");
                        continue;
                }
 
+               if (!default_mounts[i].destination) {
+                       ret = umount2(source, MNT_DETACH);
+                       if (ret < 0)
+                               return log_error_errno(-1, errno,
+                                                      "Failed to unmount \"%s\"",
+                                                      source);
+                       TRACE("Unmounted automount \"%s\"", source);
+                       continue;
+               }
+
                /* will act like strdup if %r is not present */
                destination = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].destination);
                if (!destination)
@@ -828,17 +835,14 @@ static int lxc_mount_auto_mounts(struct lxc_handler *handler, int flags)
                 * container can't remount it read-write.
                 */
                if ((cg_flags == LXC_AUTO_CGROUP_NOSPEC) || (cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC)) {
-                       int has_sys_admin = 0;
-
-                       if (!lxc_list_empty(&conf->keepcaps))
-                               has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
-                       else
-                               has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
-
                        if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
-                               cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
+                               cg_flags = has_cap(CAP_SYS_ADMIN, conf)
+                                              ? LXC_AUTO_CGROUP_RW
+                                              : LXC_AUTO_CGROUP_MIXED;
                        else
-                               cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
+                               cg_flags = has_cap(CAP_SYS_ADMIN, conf)
+                                              ? LXC_AUTO_CGROUP_FULL_RW
+                                              : LXC_AUTO_CGROUP_FULL_MIXED;
                }
 
                if (flags & LXC_AUTO_CGROUP_FORCE)
@@ -888,7 +892,7 @@ static const struct dev_symlinks dev_symlinks[] = {
 
 static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
 {
-       for (int i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
+       for (size_t i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
                int ret;
                struct stat s;
                const struct dev_symlinks *d = &dev_symlinks[i];
@@ -919,29 +923,29 @@ static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
 }
 
 /* Build a space-separate list of ptys to pass to systemd. */
-static bool append_ttyname(char **pp, char *name)
+static bool append_ttyname(struct lxc_tty_info *ttys, char *tty_name)
 {
-       char *p;
+       char *tty_names, *buf;
        size_t size;
 
-       if (!*pp) {
-               *pp = zalloc(strlen(name) + strlen("container_ttys=") + 1);
-               if (!*pp)
-                       return false;
+       if (!tty_name)
+               return false;
 
-               sprintf(*pp, "container_ttys=%s", name);
-               return true;
-       }
+       size = strlen(tty_name) + 1;
+       if (ttys->tty_names)
+               size += strlen(ttys->tty_names) + 1;
 
-       size = strlen(*pp) + strlen(name) + 2;
-       p = realloc(*pp, size);
-       if (!p)
+       buf = realloc(ttys->tty_names, size);
+       if (!buf)
                return false;
+       tty_names = buf;
 
-       *pp = p;
-       (void)strlcat(p, " ", size);
-       (void)strlcat(p, name, size);
-
+       if (ttys->tty_names)
+               (void)strlcat(buf, " ", size);
+       else
+               buf[0] = '\0';
+       (void)strlcat(buf, tty_name, size);
+       ttys->tty_names = tty_names;
        return true;
 }
 
@@ -954,7 +958,7 @@ static int open_ttymnt_at(int dfd, const char *path)
                     PROTECT_LOOKUP_BENEATH,
                     0);
        if (fd < 0) {
-               if (!IN_SET(errno, ENXIO, EEXIST))
+               if (errno != ENXIO && errno != EEXIST)
                        return syserror("Failed to create \"%d/\%s\"", dfd, path);
 
                SYSINFO("Failed to create \"%d/\%s\"", dfd, path);
@@ -977,7 +981,7 @@ static int lxc_setup_ttys(struct lxc_conf *conf)
        if (!conf->rootfs.path)
                return 0;
 
-       for (int i = 0; i < ttys->max; i++) {
+       for (size_t i = 0; i < ttys->max; i++) {
                __do_close int fd_to = -EBADF;
                struct lxc_terminal_info *tty = &ttys->tty[i];
 
@@ -985,7 +989,7 @@ static int lxc_setup_ttys(struct lxc_conf *conf)
                        char *tty_name, *tty_path;
 
                        ret = strnprintf(rootfs->buf, sizeof(rootfs->buf),
-                                      "/dev/%s/tty%d", ttydir, i + 1);
+                                      "/dev/%s/tty%zu", ttydir, i + 1);
                        if (ret < 0)
                                return ret_errno(-EIO);
 
@@ -1031,7 +1035,7 @@ static int lxc_setup_ttys(struct lxc_conf *conf)
                                                       rootfs->dfd_dev, tty_name,
                                                       rootfs->dfd_dev, tty_path);
                } else {
-                       ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "tty%d", i + 1);
+                       ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "tty%zu", i + 1);
                        if (ret < 0)
                                return ret_errno(-EIO);
 
@@ -1062,7 +1066,7 @@ static int lxc_setup_ttys(struct lxc_conf *conf)
                        DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, rootfs->buf);
                }
 
-               if (!append_ttyname(&conf->ttys.tty_names, tty->name))
+               if (!append_ttyname(&conf->ttys, tty->name))
                        return log_error(-1, "Error setting up container_ttys string");
        }
 
@@ -1086,17 +1090,20 @@ static int lxc_allocate_ttys(struct lxc_conf *conf)
                return -ENOMEM;
 
        for (size_t i = 0; i < conf->ttys.max; i++) {
-               int pty_nr = -1;
                struct lxc_terminal_info *tty = &ttys->tty[i];
 
                ret = lxc_devpts_terminal(conf->devpts_fd, &tty->ptx,
-                                         &tty->pty, &pty_nr, false);
+                                         &tty->pty, &tty->pty_nr, false);
                if (ret < 0) {
                        conf->ttys.max = i;
                        return syserror_set(-ENOTTY, "Failed to create tty %zu", i);
                }
+               ret = strnprintf(tty->name, sizeof(tty->name), "pts/%d", tty->pty_nr);
+               if (ret < 0)
+                       return syserror("Failed to create tty %zu", i);
+
                DEBUG("Created tty with ptx fd %d and pty fd %d and index %d",
-                     tty->ptx, tty->pty, pty_nr);
+                     tty->ptx, tty->pty, tty->pty_nr);
                tty->busy = -1;
        }
 
@@ -1110,7 +1117,7 @@ void lxc_delete_tty(struct lxc_tty_info *ttys)
        if (!ttys || !ttys->tty)
                return;
 
-       for (int i = 0; i < ttys->max; i++) {
+       for (size_t i = 0; i < ttys->max; i++) {
                struct lxc_terminal_info *tty = &ttys->tty[i];
                close_prot_errno_disarm(tty->ptx);
                close_prot_errno_disarm(tty->pty);
@@ -1121,7 +1128,6 @@ void lxc_delete_tty(struct lxc_tty_info *ttys)
 
 static int __lxc_send_ttys_to_parent(struct lxc_handler *handler)
 {
-       int i;
        int ret = -1;
        struct lxc_conf *conf = handler->conf;
        struct lxc_tty_info *ttys = &conf->ttys;
@@ -1130,7 +1136,7 @@ static int __lxc_send_ttys_to_parent(struct lxc_handler *handler)
        if (ttys->max == 0)
                return 0;
 
-       for (i = 0; i < ttys->max; i++) {
+       for (size_t i = 0; i < ttys->max; i++) {
                int ttyfds[2];
                struct lxc_terminal_info *tty = &ttys->tty[i];
 
@@ -1178,6 +1184,7 @@ static int lxc_create_ttys(struct lxc_handler *handler)
                        SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
                        goto on_error;
                }
+               TRACE("Set \"container_ttys=%s\"", conf->ttys.tty_names);
        }
 
        return 0;
@@ -1301,7 +1308,7 @@ enum {
 
 static int lxc_fill_autodev(struct lxc_rootfs *rootfs)
 {
-       int i, ret;
+       int ret;
        mode_t cmask;
        int use_mknod = LXC_DEVNODE_MKNOD;
 
@@ -1311,7 +1318,7 @@ static int lxc_fill_autodev(struct lxc_rootfs *rootfs)
        INFO("Populating \"/dev\"");
 
        cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
-       for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
+       for (size_t i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
                const struct lxc_device_node *device = &lxc_devices[i];
 
                if (use_mknod >= LXC_DEVNODE_MKNOD) {
@@ -1438,6 +1445,23 @@ static int lxc_mount_rootfs(struct lxc_rootfs *rootfs)
        return log_trace(0, "Container uses separate rootfs. Opened container's rootfs");
 }
 
+static bool lxc_rootfs_overmounted(struct lxc_rootfs *rootfs)
+{
+       __do_close int fd_rootfs = -EBADF;
+
+       if (!rootfs->path)
+               fd_rootfs = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
+       else
+               fd_rootfs = open_at(-EBADF, rootfs->mount, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
+       if (fd_rootfs < 0)
+               return true;
+
+       if (!same_file_lax(rootfs->dfd_mnt, fd_rootfs))
+               return syswarn_ret(true, "Rootfs seems to have changed after setting up mounts");
+
+       return false;
+}
+
 static int lxc_chroot(const struct lxc_rootfs *rootfs)
 {
        __do_free char *nroot = NULL;
@@ -1582,8 +1606,11 @@ static int lxc_pivot_root(const struct lxc_rootfs *rootfs)
                return log_error_errno(-errno, errno, "Failed to enter old root directory");
 
        /*
-        * Make fd_oldroot a depedent mount to make sure our umounts don't
-        * propagate to the host.
+        * Unprivileged containers will have had all their mounts turned into
+        * dependent mounts when the container was created. But for privileged
+        * containers we need to turn the old root mount tree into a dependent
+        * mount tree to prevent propagating mounts and umounts into the host
+        * mount namespace.
         */
        ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
        if (ret < 0)
@@ -1597,6 +1624,31 @@ static int lxc_pivot_root(const struct lxc_rootfs *rootfs)
        if (ret < 0)
                return log_error_errno(-errno, errno, "Failed to re-enter new root directory \"%s\"", rootfs->mount);
 
+       /*
+        * Finally, we turn the rootfs into a shared mount. Note that this
+        * doesn't reestablish mount propagation with the host's mount
+        * namespace. Instead we'll create a new peer group.
+        *
+        * We're doing this because most workloads do rely on the rootfs being
+        * a shared mount. For example, systemd daemons like systemd-udevd run in
+        * their own mount namespace. Their mount namespace has been made a
+        * dependent mount (MS_SLAVE) with the host rootfs as its dominating
+        * mount. This means new mounts on the host propagate into the
+        * respective services.
+        *
+        * This is broken if we leave the container's rootfs a dependent mount.
+        * In which case both the container's rootfs and the service's rootfs
+        * will be dependent mounts with the host's rootfs as their dominating
+        * mount. So if you were to mount over the rootfs from the host it
+        * would not just propagate into the container's mount namespace it
+        * would also propagate into the service. That's nonsense semantics for
+        * nearly all relevant use-cases. Instead, establish the container's
+        * rootfs as a separate peer group mirroring the behavior on the host.
+        */
+       ret = mount("", ".", "", MS_SHARED | MS_REC, NULL);
+       if (ret < 0)
+               return log_error_errno(-errno, errno, "Failed to turn new root mount tree into shared mount tree");
+
        TRACE("Changed into new rootfs \"%s\"", rootfs->mount);
        return 0;
 }
@@ -1858,7 +1910,8 @@ static int lxc_finish_devpts_child(struct lxc_handler *handler)
                return syserror("Failed to create path");
 
        close_prot_errno_disarm(conf->devpts_fd);
-       return umount2(rootfs->buf, MNT_DETACH);
+       (void)umount2(rootfs->buf, MNT_DETACH);
+       return 0;
 }
 
 static int lxc_send_devpts_to_parent(struct lxc_handler *handler)
@@ -2159,7 +2212,7 @@ static int lxc_setup_console(const struct lxc_handler *handler,
 
 static int parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
 {
-       ssize_t ret;
+       size_t ret;
 
        /* If '=' is contained in opt, the option must go into data. */
        if (!strchr(opt, '=')) {
@@ -2183,12 +2236,12 @@ static int parse_mntopt(char *opt, unsigned long *flags, char **data, size_t siz
 
        if (strlen(*data)) {
                ret = strlcat(*data, ",", size);
-               if (ret < 0)
+               if (ret >= size)
                        return log_error_errno(ret, errno, "Failed to append \",\" to %s", *data);
        }
 
        ret = strlcat(*data, opt, size);
-       if (ret < 0)
+       if (ret >= size)
                return log_error_errno(ret, errno, "Failed to append \"%s\" to %s", opt, *data);
 
        return 0;
@@ -2250,7 +2303,7 @@ static int parse_vfs_attr(struct lxc_mount_options *opts, char *opt, size_t size
                        return 0;
                }
 
-               if (mo->flag == ~0)
+               if (mo->flag == (__u64)~0)
                        return log_info(0, "Ignoring %s mount option", mo->name);
 
                if (mo->clear) {
@@ -2778,14 +2831,13 @@ static const char nesting_helpers[] =
 "proc dev/.lxc/proc proc create=dir,optional 0 0\n"
 "sys dev/.lxc/sys sysfs create=dir,optional 0 0\n";
 
-FILE *make_anonymous_mount_file(struct lxc_list *mount,
+FILE *make_anonymous_mount_file(const struct list_head *mount_entries,
                                bool include_nesting_helpers)
 {
        __do_close int fd = -EBADF;
        FILE *f;
        int ret;
-       char *mount_entry;
-       struct lxc_list *iterator;
+       struct string_entry *entry;
 
        fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
        if (fd < 0) {
@@ -2801,14 +2853,13 @@ FILE *make_anonymous_mount_file(struct lxc_list *mount,
                TRACE("Created temporary mount file");
        }
 
-       lxc_list_for_each (iterator, mount) {
+       list_for_each_entry(entry, mount_entries, head) {
                size_t len;
 
-               mount_entry = iterator->elem;
-               len = strlen(mount_entry);
+               len = strlen(entry->val);
 
-               ret = lxc_write_nointr(fd, mount_entry, len);
-               if (ret != len)
+               ret = lxc_write_nointr(fd, entry->val, len);
+               if (ret < 0 || (size_t)ret != len)
                        return NULL;
 
                ret = lxc_write_nointr(fd, "\n", 1);
@@ -2834,12 +2885,12 @@ FILE *make_anonymous_mount_file(struct lxc_list *mount,
 }
 
 static int setup_mount_entries(const struct lxc_conf *conf,
-                              struct lxc_rootfs *rootfs, struct lxc_list *mount,
+                              struct lxc_rootfs *rootfs,
                               const char *lxc_name, const char *lxc_path)
 {
        __do_fclose FILE *f = NULL;
 
-       f = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
+       f = make_anonymous_mount_file(&conf->mount_entries, conf->lsm_aa_allow_nesting);
        if (!f)
                return -1;
 
@@ -2863,7 +2914,7 @@ static int __lxc_idmapped_mounts_child(struct lxc_handler *handler, FILE *f)
                struct lxc_mount_options opts = {};
                int dfd_from;
                const char *source_relative, *target_relative;
-               struct lxc_mount_attr attr = {};
+               struct mount_attr attr = {};
 
                ret = parse_lxc_mount_attrs(&opts, mntent.mnt_opts);
                if (ret < 0)
@@ -2983,8 +3034,8 @@ static int __lxc_idmapped_mounts_child(struct lxc_handler *handler, FILE *f)
 
                /* Set propagation mount options. */
                if (opts.attr.propagation) {
-                       attr = (struct lxc_mount_attr) {
-                               attr.propagation = opts.attr.propagation,
+                       attr = (struct mount_attr) {
+                               .propagation = opts.attr.propagation,
                        };
 
                        ret = mount_setattr(fd_from,
@@ -3018,7 +3069,7 @@ static int __lxc_idmapped_mounts_child(struct lxc_handler *handler, FILE *f)
                        dfd_from = rootfs->dfd_mnt;
                else
                        dfd_from = rootfs->dfd_host;
-               fd_to = open_at(dfd_from, target_relative, PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH_WITH_SYMLINKS, 0);
+               fd_to = open_at(dfd_from, target_relative, PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH_XDEV, 0);
                if (fd_to < 0) {
                        if (opts.optional) {
                                TRACE("Skipping optional idmapped mount");
@@ -3060,10 +3111,10 @@ static int lxc_idmapped_mounts_child(struct lxc_handler *handler)
        int fret = -1;
        struct lxc_conf *conf = handler->conf;
        const char *fstab = conf->fstab;
-       struct lxc_list *mount = &conf->mount_list;
        int ret;
 
-       f_entries = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
+       f_entries = make_anonymous_mount_file(&conf->mount_entries,
+                                             conf->lsm_aa_allow_nesting);
        if (!f_entries) {
                SYSERROR("Failed to create anonymous mount file");
                goto out;
@@ -3105,123 +3156,119 @@ out:
        return fret;
 }
 
-static int parse_cap(const char *cap)
+int parse_cap(const char *cap_name, __u32 *cap)
 {
-       size_t i;
-       int capid = -1;
        size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
-       char *ptr = NULL;
+       int ret;
+       unsigned int res;
+       __u32 last_cap;
 
-       if (strequal(cap, "none"))
+       if (strequal(cap_name, "none"))
                return -2;
 
-       for (i = 0; i < end; i++) {
-               if (!strequal(cap, caps_opt[i].name))
+       for (size_t i = 0; i < end; i++) {
+               if (!strequal(cap_name, caps_opt[i].name))
                        continue;
 
-               capid = caps_opt[i].value;
-               break;
+               *cap = caps_opt[i].value;
+               return 0;
        }
 
-       if (capid < 0) {
-               /* Try to see if it's numeric, so the user may specify
-                * capabilities that the running kernel knows about but we
-                * don't
-                */
-               errno = 0;
-               capid = strtol(cap, &ptr, 10);
-               if (!ptr || *ptr != '\0' || errno != 0)
-                       /* not a valid number */
-                       capid = -1;
-               else if (capid > lxc_caps_last_cap())
-                       /* we have a number but it's not a valid
-                        * capability */
-                       capid = -1;
-       }
-
-       return capid;
+       /*
+        * Try to see if it's numeric, so the user may specify
+        * capabilities that the running kernel knows about but we
+        * don't.
+        */
+       ret = lxc_safe_uint(cap_name, &res);
+       if (ret < 0)
+               return -1;
+
+       ret = lxc_caps_last_cap(&last_cap);
+       if (ret)
+               return -1;
+
+       if ((__u32)res > last_cap)
+               return -1;
+
+       *cap = (__u32)res;
+       return 0;
 }
 
-int in_caplist(int cap, struct lxc_list *caps)
+bool has_cap(__u32 cap, struct lxc_conf *conf)
 {
-       int capid;
-       struct lxc_list *iterator;
+       bool cap_in_list = false;
+       struct cap_entry *cap_entry;
+
+       list_for_each_entry(cap_entry, &conf->caps.list, head) {
+               if (cap_entry->cap != cap)
+                       continue;
 
-       lxc_list_for_each (iterator, caps) {
-               capid = parse_cap(iterator->elem);
-               if (capid == cap)
-                       return 1;
+               cap_in_list = true;
        }
 
-       return 0;
+       /* The capability is kept. */
+       if (conf->caps.keep)
+               return cap_in_list;
+
+       /* The capability is not dropped. */
+       return !cap_in_list;
 }
 
-static int setup_caps(struct lxc_list *caps)
+static int capabilities_deny(struct lxc_conf *conf)
 {
-       int capid;
-       char *drop_entry;
-       struct lxc_list *iterator;
+       struct cap_entry *cap;
 
-       lxc_list_for_each (iterator, caps) {
+       list_for_each_entry(cap, &conf->caps.list, head) {
                int ret;
 
-               drop_entry = iterator->elem;
-
-               capid = parse_cap(drop_entry);
-               if (capid < 0)
-                       return log_error(-1, "unknown capability %s", drop_entry);
-
-               ret = prctl(PR_CAPBSET_DROP, prctl_arg(capid), prctl_arg(0),
+               ret = prctl(PR_CAPBSET_DROP, prctl_arg(cap->cap), prctl_arg(0),
                            prctl_arg(0), prctl_arg(0));
                if (ret < 0)
-                       return log_error_errno(-1, errno, "Failed to remove %s capability", drop_entry);
-               DEBUG("Dropped %s (%d) capability", drop_entry, capid);
+                       return syserror("Failed to remove %s capability", cap->cap_name);
+
+               DEBUG("Dropped %s (%d) capability", cap->cap_name, cap->cap);
        }
 
        DEBUG("Capabilities have been setup");
        return 0;
 }
 
-static int dropcaps_except(struct lxc_list *caps)
+static int capabilities_allow(struct lxc_conf *conf)
 {
-       __do_free int *caplist = NULL;
-       int i, capid, numcaps;
-       char *keep_entry;
-       struct lxc_list *iterator;
+       __do_free __u32 *keep_bits = NULL;
+       int ret;
+       struct cap_entry *cap;
+       __u32 last_cap, nr_u32;
 
-       numcaps = lxc_caps_last_cap() + 1;
-       if (numcaps <= 0 || numcaps > 200)
-               return -1;
-       TRACE("Found %d capabilities", numcaps);
+       ret = lxc_caps_last_cap(&last_cap);
+       if (ret || last_cap > 200)
+               return ret_errno(EINVAL);
 
-       /* caplist[i] is 1 if we keep capability i */
-       caplist = must_realloc(NULL, numcaps * sizeof(int));
-       memset(caplist, 0, numcaps * sizeof(int));
+       TRACE("Found %d capabilities", last_cap);
 
-       lxc_list_for_each (iterator, caps) {
-               keep_entry = iterator->elem;
+       nr_u32 = BITS_TO_LONGS(last_cap);
+       keep_bits = zalloc(nr_u32 * sizeof(__u32));
+       if (!keep_bits)
+               return ret_errno(ENOMEM);
 
-               capid = parse_cap(keep_entry);
-               if (capid == -2)
+       list_for_each_entry(cap, &conf->caps.list, head) {
+               if (cap->cap > last_cap)
                        continue;
 
-               if (capid < 0)
-                       return log_error(-1, "Unknown capability %s", keep_entry);
-
-               DEBUG("Keep capability %s (%d)", keep_entry, capid);
-               caplist[capid] = 1;
+               set_bit(cap->cap, keep_bits);
+               DEBUG("Keeping %s (%d) capability", cap->cap_name, cap->cap);
        }
 
-       for (i = 0; i < numcaps; i++) {
-               int ret;
-
-               if (caplist[i])
+       for (__u32 cap_bit = 0; cap_bit <= last_cap; cap_bit++) {
+               if (is_set(cap_bit, keep_bits))
                        continue;
 
-               ret = prctl(PR_CAPBSET_DROP, prctl_arg(i), prctl_arg(0),
+               ret = prctl(PR_CAPBSET_DROP, prctl_arg(cap_bit), prctl_arg(0),
                            prctl_arg(0), prctl_arg(0));
                if (ret < 0)
-                       return log_error_errno(-1, errno, "Failed to remove capability %d", i);
+                       return syserror("Failed to remove capability %d", cap_bit);
+
+               TRACE("Dropped capability %d", cap_bit);
        }
 
        DEBUG("Capabilities have been setup");
@@ -3283,7 +3330,7 @@ int setup_sysctl_parameters(struct lxc_conf *conf)
        char filename[PATH_MAX] = {0};
        struct lxc_sysctl *sysctl, *nsysctl;
 
-       if (!list_empty(&conf->sysctls))
+       if (list_empty(&conf->sysctls))
                return 0;
 
        list_for_each_entry_safe(sysctl, nsysctl, &conf->sysctls, head) {
@@ -3300,8 +3347,11 @@ int setup_sysctl_parameters(struct lxc_conf *conf)
                if (ret < 0)
                        return log_error_errno(-1, errno, "Failed to setup sysctl parameters %s to %s",
                                               sysctl->key, sysctl->value);
+
+               TRACE("Setting %s to %s", filename, sysctl->value);
        }
 
+       TRACE("Setup /proc/sys settings");
        return 0;
 }
 
@@ -3312,7 +3362,7 @@ int setup_proc_filesystem(struct lxc_conf *conf, pid_t pid)
        char filename[PATH_MAX] = {0};
        struct lxc_proc *proc;
 
-       if (!list_empty(&conf->procs))
+       if (list_empty(&conf->procs))
                return 0;
 
        list_for_each_entry(proc, &conf->procs, head) {
@@ -3329,6 +3379,8 @@ int setup_proc_filesystem(struct lxc_conf *conf, pid_t pid)
                if (ret < 0)
                        return log_error_errno(-1, errno, "Failed to setup proc filesystem %s to %s",
                                               proc->filename, proc->value);
+
+               TRACE("Setting %s to %s", filename, proc->value);
        }
 
        TRACE("Setup /proc/%d settings", pid);
@@ -3384,9 +3436,8 @@ struct lxc_conf *lxc_conf_init(void)
        /* Block ("allowlist") all devices by default. */
        new->bpf_devices.list_type = LXC_BPF_DEVICE_CGROUP_ALLOWLIST;
        INIT_LIST_HEAD(&(new->bpf_devices).devices);
-       lxc_list_init(&new->mount_list);
-       lxc_list_init(&new->caps);
-       lxc_list_init(&new->keepcaps);
+       INIT_LIST_HEAD(&new->mount_entries);
+       INIT_LIST_HEAD(&new->caps.list);
        INIT_LIST_HEAD(&new->id_map);
        new->root_nsuid_map = NULL;
        new->root_nsgid_map = NULL;
@@ -3396,17 +3447,19 @@ struct lxc_conf *lxc_conf_init(void)
        INIT_LIST_HEAD(&new->procs);
        new->hooks_version = 0;
        for (i = 0; i < NUM_LXC_HOOKS; i++)
-               lxc_list_init(&new->hooks[i]);
-       lxc_list_init(&new->groups);
+               INIT_LIST_HEAD(&new->hooks[i]);
+       INIT_LIST_HEAD(&new->groups);
        INIT_LIST_HEAD(&new->state_clients);
        new->lsm_aa_profile = NULL;
-       lxc_list_init(&new->lsm_aa_raw);
+       INIT_LIST_HEAD(&new->lsm_aa_raw);
        new->lsm_se_context = NULL;
        new->lsm_se_keyring_context = NULL;
        new->keyring_disable_session = false;
        new->transient_procfs_mnt = false;
        new->shmount.path_host = NULL;
        new->shmount.path_cont = NULL;
+       new->sched_core = false;
+       new->sched_core_cookie = INVALID_SCHED_CORE_COOKIE;
 
        /* if running in a new user namespace, init and COMMAND
         * default to running as UID/GID 0 when using lxc-execute */
@@ -3460,7 +3513,7 @@ int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
                return log_error_errno(-1, errno, "Failed to open \"%s\"", path);
 
        ret = lxc_write_nointr(fd, buf, buf_size);
-       if (ret != buf_size)
+       if (ret < 0 || (size_t)ret != buf_size)
                return log_error_errno(-1, errno, "Failed to write %cid mapping to \"%s\"",
                                       idtype == ID_TYPE_UID ? 'u' : 'g', path);
 
@@ -3531,7 +3584,9 @@ static struct id_map *find_mapped_hostid_entry(const struct list_head *idmap,
 
 int lxc_map_ids(struct list_head *idmap, pid_t pid)
 {
-       int hostuid, hostgid, fill, left;
+       int fill, left;
+       uid_t hostuid;
+       gid_t hostgid;
        char u_or_g;
        char *pos;
        char cmd_output[PATH_MAX];
@@ -3591,7 +3646,7 @@ int lxc_map_ids(struct list_head *idmap, pid_t pid)
        /* Check if we really need to use newuidmap and newgidmap.
        * If the user is only remapping their own {g,u}id, we don't need it.
        */
-       if (use_shadow && list_len(idmap) == 2) {
+       if (use_shadow && list_len(map, idmap, head) == 2) {
                use_shadow = false;
                list_for_each_entry(map, idmap, head) {
                        if (map->idtype == ID_TYPE_UID && map->range == 1 &&
@@ -3740,7 +3795,7 @@ static int lxc_transient_proc(struct lxc_rootfs *rootfs)
                        return log_error_errno(-errno, errno, "Failed to create %d(proc)", rootfs->dfd_mnt);
 
                goto domount;
-       } else if (link_len >= sizeof(link)) {
+       } else if ((size_t)link_len >= sizeof(link)) {
                return log_error_errno(-EIO, EIO, "Truncated link target");
        }
        link[link_len] = '\0';
@@ -3944,11 +3999,11 @@ int lxc_setup_rootfs_prepare_root(struct lxc_conf *conf, const char *name,
 static bool verify_start_hooks(struct lxc_conf *conf)
 {
        char path[PATH_MAX];
-       struct lxc_list *it;
+       struct string_entry *hook;
 
-       lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
+       list_for_each_entry(hook, &conf->hooks[LXCHOOK_START], head) {
                int ret;
-               char *hookname = it->elem;
+               char *hookname = hook->val;
 
                ret = strnprintf(path, sizeof(path), "%s%s",
                               conf->rootfs.path ? conf->rootfs.mount : "",
@@ -4083,7 +4138,7 @@ int lxc_idmapped_mounts_parent(struct lxc_handler *handler)
 
        for (;;) {
                __do_close int fd_from = -EBADF, fd_userns = -EBADF;
-               struct lxc_mount_attr attr = {};
+               struct mount_attr attr = {};
                struct lxc_mount_options opts = {};
                ssize_t ret;
 
@@ -4094,7 +4149,7 @@ int lxc_idmapped_mounts_parent(struct lxc_handler *handler)
                        return syserror("Failed to receive idmapped mount file descriptors from child");
 
                if (fd_from < 0 || fd_userns < 0)
-                       return log_trace(0, "Finished receiving idmapped mount file descriptors from child");
+                       return log_trace(0, "Finished receiving idmapped mount file descriptors (%d | %d) from child", fd_from, fd_userns);
 
                attr.attr_set   = MOUNT_ATTR_IDMAP;
                attr.userns_fd  = fd_userns;
@@ -4138,14 +4193,15 @@ static int lxc_recv_ttys_from_child(struct lxc_handler *handler)
        if (!info_new->tty)
                return ret_errno(ENOMEM);
 
-       for (int i = 0; i < ttys_max; i++) {
+       for (size_t i = 0; i < ttys_max; i++) {
                terminal_info = &info_new->tty[i];
                terminal_info->busy = -1;
+               terminal_info->pty_nr = -1;
                terminal_info->ptx = -EBADF;
                terminal_info->pty = -EBADF;
        }
 
-       for (int i = 0; i < ttys_max; i++) {
+       for (size_t i = 0; i < ttys_max; i++) {
                int ptx = -EBADF, pty = -EBADF;
 
                ret = lxc_abstract_unix_recv_two_fds(sock, &ptx, &pty);
@@ -4275,6 +4331,28 @@ int lxc_sync_fds_child(struct lxc_handler *handler)
        return 0;
 }
 
+static int setup_capabilities(struct lxc_conf *conf)
+{
+       int ret;
+
+       if (conf->caps.keep)
+               ret = capabilities_allow(conf);
+       else
+               ret = capabilities_deny(conf);
+       if (ret < 0)
+               return syserror_ret(ret, "Failed to %s capabilities", conf->caps.keep ? "allow" : "deny");
+
+       return 0;
+}
+
+static int make_shmount_dependent_mount(const struct lxc_conf *conf)
+{
+       if (!(conf->auto_mounts & LXC_AUTO_SHMOUNTS_MASK))
+               return 0;
+
+       return mount(NULL, conf->shmount.path_cont, NULL, MS_REC | MS_SLAVE, 0);
+}
+
 int lxc_setup(struct lxc_handler *handler)
 {
        int ret;
@@ -4328,9 +4406,8 @@ int lxc_setup(struct lxc_handler *handler)
        if (ret < 0)
                return log_error(-1, "Failed to setup mounts");
 
-       if (!lxc_list_empty(&lxc_conf->mount_list)) {
-               ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
-                                         &lxc_conf->mount_list, name, lxcpath);
+       if (!list_empty(&lxc_conf->mount_entries)) {
+               ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs, name, lxcpath);
                if (ret < 0)
                        return log_error(-1, "Failed to setup mount entries");
        }
@@ -4359,6 +4436,9 @@ int lxc_setup(struct lxc_handler *handler)
        if (ret < 0)
                return log_error(-1, "Failed to run mount hooks");
 
+       if (lxc_rootfs_overmounted(&lxc_conf->rootfs))
+               return log_error(-1, "Rootfs overmounted");
+
        if (lxc_conf->autodev > 0) {
                ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
                if (ret < 0)
@@ -4402,6 +4482,11 @@ int lxc_setup(struct lxc_handler *handler)
        if (ret < 0)
                return log_error(-1, "Failed to pivot root into rootfs");
 
+       ret = make_shmount_dependent_mount(lxc_conf);
+       if (ret < 0)
+               return log_error(-1, "Failed to turn mount tunnel \"%s\" into dependent mount",
+                                lxc_conf->shmount.path_cont);
+
        /* Setting the boot-id is best-effort for now. */
        if (lxc_conf->autodev > 0)
                (void)lxc_setup_boot_id();
@@ -4418,15 +4503,9 @@ int lxc_setup(struct lxc_handler *handler)
        if (ret < 0)
                return log_error(-1, "Failed to setup sysctl parameters");
 
-       if (!lxc_list_empty(&lxc_conf->keepcaps)) {
-               if (!lxc_list_empty(&lxc_conf->caps))
-                       return log_error(-1, "Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both");
-
-               if (dropcaps_except(&lxc_conf->keepcaps))
-                       return log_error(-1, "Failed to keep capabilities");
-       } else if (setup_caps(&lxc_conf->caps)) {
-               return log_error(-1, "Failed to drop capabilities");
-       }
+       ret = setup_capabilities(lxc_conf);
+       if (ret < 0)
+               return log_error(-1, "Failed to setup capabilities");
 
        put_lxc_rootfs(&handler->conf->rootfs, true);
        NOTICE("The container \"%s\" is set up", name);
@@ -4437,8 +4516,8 @@ int lxc_setup(struct lxc_handler *handler)
 int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
                  char *argv[])
 {
-       struct lxc_list *it;
        int which;
+       struct string_entry *entry;
 
        for (which = 0; which < NUM_LXC_HOOKS; which ++) {
                if (strequal(hookname, lxchook_names[which]))
@@ -4448,9 +4527,9 @@ int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
        if (which >= NUM_LXC_HOOKS)
                return -1;
 
-       lxc_list_for_each (it, &conf->hooks[which]) {
+       list_for_each_entry(entry, &conf->hooks[which], head) {
                int ret;
-               char *hook = it->elem;
+               char *hook = entry->val;
 
                ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
                                      hookname, argv);
@@ -4463,15 +4542,16 @@ int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
 
 int lxc_clear_config_caps(struct lxc_conf *c)
 {
-       struct lxc_list *it, *next;
+       struct cap_entry *cap, *ncap;
 
-       lxc_list_for_each_safe (it, &c->caps, next) {
-               lxc_list_del(it);
-               free(it->elem);
-               free(it);
+       list_for_each_entry_safe(cap, ncap, &c->caps.list, head) {
+               list_del(&cap->head);
+               free(cap->cap_name);
+               free(cap);
        }
 
-       lxc_list_init(&c->caps);
+       c->caps.keep = false;
+       INIT_LIST_HEAD(&c->caps.list);
        return 0;
 }
 
@@ -4500,20 +4580,6 @@ int lxc_clear_idmaps(struct lxc_conf *c)
        return lxc_free_idmap(&c->id_map);
 }
 
-int lxc_clear_config_keepcaps(struct lxc_conf *c)
-{
-       struct lxc_list *it, *next;
-
-       lxc_list_for_each_safe (it, &c->keepcaps, next) {
-               lxc_list_del(it);
-               free(it->elem);
-               free(it);
-       }
-
-       lxc_list_init(&c->keepcaps);
-       return 0;
-}
-
 int lxc_clear_namespace(struct lxc_conf *c)
 {
        for (int i = 0; i < LXC_NS_MAX; i++)
@@ -4663,15 +4729,15 @@ int lxc_clear_procs(struct lxc_conf *c, const char *key)
 
 int lxc_clear_groups(struct lxc_conf *c)
 {
-       struct lxc_list *it, *next;
+       struct string_entry *entry, *nentry;
 
-       lxc_list_for_each_safe (it, &c->groups, next) {
-               lxc_list_del(it);
-               free(it->elem);
-               free(it);
+       list_for_each_entry_safe(entry, nentry, &c->groups, head) {
+               list_del(&entry->head);
+               free(entry->val);
+               free(entry);
        }
 
-       lxc_list_init(&c->groups);
+       INIT_LIST_HEAD(&c->groups);
        return 0;
 }
 
@@ -4692,15 +4758,15 @@ int lxc_clear_environment(struct lxc_conf *c)
 
 int lxc_clear_mount_entries(struct lxc_conf *c)
 {
-       struct lxc_list *it, *next;
+       struct string_entry *entry, *nentry;
 
-       lxc_list_for_each_safe (it, &c->mount_list, next) {
-               lxc_list_del(it);
-               free(it->elem);
-               free(it);
+       list_for_each_entry_safe(entry, nentry, &c->mount_entries, head) {
+               list_del(&entry->head);
+               free(entry->val);
+               free(entry);
        }
 
-       lxc_list_init(&c->mount_list);
+       INIT_LIST_HEAD(&c->mount_entries);
        return 0;
 }
 
@@ -4712,9 +4778,9 @@ int lxc_clear_automounts(struct lxc_conf *c)
 
 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
 {
-       struct lxc_list *it, *next;
        const char *k = NULL;
        bool all = false, done = false;
+       struct string_entry *entry, *nentry;
 
        if (strequal(key, "lxc.hook"))
                all = true;
@@ -4725,13 +4791,12 @@ int lxc_clear_hooks(struct lxc_conf *c, const char *key)
 
        for (int i = 0; i < NUM_LXC_HOOKS; i++) {
                if (all || strequal(k, lxchook_names[i])) {
-                       lxc_list_for_each_safe (it, &c->hooks[i], next) {
-                               lxc_list_del(it);
-                               free(it->elem);
-                               free(it);
+                       list_for_each_entry_safe(entry, nentry, &c->hooks[i], head) {
+                               list_del(&entry->head);
+                               free(entry->val);
+                               free(entry);
                        }
-                       lxc_list_init(&c->hooks[i]);
-
+                       INIT_LIST_HEAD(&c->hooks[i]);
                        done = true;
                }
        }
@@ -4744,15 +4809,15 @@ int lxc_clear_hooks(struct lxc_conf *c, const char *key)
 
 int lxc_clear_apparmor_raw(struct lxc_conf *c)
 {
-       struct lxc_list *it, *next;
+       struct string_entry *entry, *nentry;
 
-       lxc_list_for_each_safe (it, &c->lsm_aa_raw, next) {
-               lxc_list_del(it);
-               free(it->elem);
-               free(it);
+       list_for_each_entry_safe(entry, nentry, &c->lsm_aa_raw, head) {
+               list_del(&entry->head);
+               free(entry->val);
+               free(entry);
        }
 
-       lxc_list_init(&c->lsm_aa_raw);
+       INIT_LIST_HEAD(&c->lsm_aa_raw);
        return 0;
 }
 
@@ -4789,7 +4854,6 @@ void lxc_conf_free(struct lxc_conf *conf)
        free(conf->lsm_se_keyring_context);
        lxc_seccomp_free(&conf->seccomp);
        lxc_clear_config_caps(conf);
-       lxc_clear_config_keepcaps(conf);
        lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
        lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
        lxc_clear_cgroups_devices(conf);
@@ -4809,6 +4873,7 @@ void lxc_conf_free(struct lxc_conf *conf)
        free(conf->cgroup_meta.container_dir);
        free(conf->cgroup_meta.namespace_dir);
        free(conf->cgroup_meta.controllers);
+       free(conf->cgroup_meta.systemd_scope);
        free(conf->shmount.path_host);
        free(conf->shmount.path_cont);
        free(conf);
@@ -5488,11 +5553,20 @@ int userns_exec_mapped_root(const char *path, int path_fd,
 
                close_prot_errno_disarm(sock_fds[0]);
 
-               if (!lxc_switch_uid_gid(0, 0))
+               if (!lxc_drop_groups() && errno != EPERM)
+                       _exit(EXIT_FAILURE);
+
+               ret = setresgid(0, 0, 0);
+               if (ret < 0) {
+                       SYSERROR("Failed to setresgid(0, 0, 0)");
                        _exit(EXIT_FAILURE);
+               }
 
-               if (!lxc_drop_groups())
+               ret = setresuid(0, 0, 0);
+               if (ret < 0) {
+                       SYSERROR("Failed to setresuid(0, 0, 0)");
                        _exit(EXIT_FAILURE);
+               }
 
                ret = fchown(target_fd, 0, st.st_gid);
                if (ret) {
@@ -5540,9 +5614,12 @@ on_error:
 
        /* Wait for child to finish. */
        if (pid < 0)
+               return log_error(-1, "Failed to create child process");
+
+       if (!wait_exited(pid))
                return -1;
 
-       return wait_for_pid(pid);
+       return 0;
 }
 
 /* not thread-safe, do not use from api without first forking */
@@ -5551,11 +5628,11 @@ static char *getuname(void)
        __do_free char *buf = NULL;
        struct passwd pwent;
        struct passwd *pwentp = NULL;
-       size_t bufsize;
+       ssize_t bufsize;
        int ret;
 
        bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
-       if (bufsize == -1)
+       if (bufsize < 0)
                bufsize = 1024;
 
        buf = zalloc(bufsize);
@@ -5579,11 +5656,11 @@ static char *getgname(void)
        __do_free char *buf = NULL;
        struct group grent;
        struct group *grentp = NULL;
-       size_t bufsize;
+       ssize_t bufsize;
        int ret;
 
        bufsize = sysconf(_SC_GETGR_R_SIZE_MAX);
-       if (bufsize == -1)
+       if (bufsize < 0)
                bufsize = 1024;
 
        buf = zalloc(bufsize);
@@ -5705,29 +5782,6 @@ void suggest_default_idmap(void)
        ERROR("lxc.idmap = g 0 %u %u", gid, grange);
 }
 
-/* Return the list of cgroup_settings sorted according to the following rules
- * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
- */
-void sort_cgroup_settings(struct lxc_conf *conf)
-{
-       struct lxc_cgroup *cgroup, *memsw_limit, *ncgroup;
-
-       /* Iterate over the cgroup settings and copy them to the output list. */
-       list_for_each_entry_safe(cgroup, ncgroup, &conf->cgroup, head) {
-               if (strequal(cgroup->subsystem, "memory.memsw.limit_in_bytes")) {
-                       /* Store the memsw_limit location */
-                       memsw_limit = cgroup;
-               } else if (memsw_limit && strequal(cgroup->subsystem, "memory.limit_in_bytes")) {
-                       /*
-                        * lxc.cgroup.memory.memsw.limit_in_bytes is found
-                        * before lxc.cgroup.memory.limit_in_bytes, swap these
-                        * two items.
-                        */
-                       list_swap(&memsw_limit->head, &cgroup->head);
-               }
-       }
-}
-
 int lxc_set_environment(const struct lxc_conf *conf)
 {
        struct environment_entry *env;