]> git.proxmox.com Git - mirror_lxc.git/blobdiff - src/lxc/conf.c
tree-wide: use lxc_drop_groups() instead of lxc_setgroups(0, NULL)
[mirror_lxc.git] / src / lxc / conf.c
index 0a1bb6d75af96df0dc4170939424c685c122c349..4d258ada71b7850281300e21086f848cbfb9b030 100644 (file)
@@ -35,8 +35,7 @@
 
 #include "af_unix.h"
 #include "caps.h"
-#include "cgroup.h"
-#include "cgroup2_devices.h"
+#include "cgroups/cgroup.h"
 #include "conf.h"
 #include "config.h"
 #include "confile.h"
@@ -55,7 +54,7 @@
 #include "process_utils.h"
 #include "ringbuf.h"
 #include "start.h"
-#include "storage.h"
+#include "storage/storage.h"
 #include "storage/overlay.h"
 #include "syscall_wrappers.h"
 #include "terminal.h"
@@ -182,56 +181,47 @@ static struct mount_opt propagation_opt[] = {
 
 static struct caps_opt caps_opt[] = {
 #if HAVE_LIBCAP
-       { "chown",            CAP_CHOWN            },
-       { "dac_override",     CAP_DAC_OVERRIDE     },
-       { "dac_read_search",  CAP_DAC_READ_SEARCH  },
-       { "fowner",           CAP_FOWNER           },
-       { "fsetid",           CAP_FSETID           },
-       { "kill",             CAP_KILL             },
-       { "setgid",           CAP_SETGID           },
-       { "setuid",           CAP_SETUID           },
-       { "setpcap",          CAP_SETPCAP          },
-       { "linux_immutable",  CAP_LINUX_IMMUTABLE  },
-       { "net_bind_service", CAP_NET_BIND_SERVICE },
-       { "net_broadcast",    CAP_NET_BROADCAST    },
-       { "net_admin",        CAP_NET_ADMIN        },
-       { "net_raw",          CAP_NET_RAW          },
-       { "ipc_lock",         CAP_IPC_LOCK         },
-       { "ipc_owner",        CAP_IPC_OWNER        },
-       { "sys_module",       CAP_SYS_MODULE       },
-       { "sys_rawio",        CAP_SYS_RAWIO        },
-       { "sys_chroot",       CAP_SYS_CHROOT       },
-       { "sys_ptrace",       CAP_SYS_PTRACE       },
-       { "sys_pacct",        CAP_SYS_PACCT        },
-       { "sys_admin",        CAP_SYS_ADMIN        },
-       { "sys_boot",         CAP_SYS_BOOT         },
-       { "sys_nice",         CAP_SYS_NICE         },
-       { "sys_resource",     CAP_SYS_RESOURCE     },
-       { "sys_time",         CAP_SYS_TIME         },
-       { "sys_tty_config",   CAP_SYS_TTY_CONFIG   },
-       { "mknod",            CAP_MKNOD            },
-       { "lease",            CAP_LEASE            },
-#ifdef CAP_AUDIT_READ
-       { "audit_read",       CAP_AUDIT_READ       },
-#endif
-#ifdef CAP_AUDIT_WRITE
-       { "audit_write",      CAP_AUDIT_WRITE      },
-#endif
-#ifdef CAP_AUDIT_CONTROL
-       { "audit_control",    CAP_AUDIT_CONTROL    },
-#endif
-       { "setfcap",          CAP_SETFCAP          },
-       { "mac_override",     CAP_MAC_OVERRIDE     },
-       { "mac_admin",        CAP_MAC_ADMIN        },
-#ifdef CAP_SYSLOG
-       { "syslog",           CAP_SYSLOG           },
-#endif
-#ifdef CAP_WAKE_ALARM
-       { "wake_alarm",       CAP_WAKE_ALARM       },
-#endif
-#ifdef CAP_BLOCK_SUSPEND
-       { "block_suspend",    CAP_BLOCK_SUSPEND    },
-#endif
+       { "chown",              CAP_CHOWN              },
+       { "dac_override",       CAP_DAC_OVERRIDE       },
+       { "dac_read_search",    CAP_DAC_READ_SEARCH    },
+       { "fowner",             CAP_FOWNER             },
+       { "fsetid",             CAP_FSETID             },
+       { "kill",               CAP_KILL               },
+       { "setgid",             CAP_SETGID             },
+       { "setuid",             CAP_SETUID             },
+       { "setpcap",            CAP_SETPCAP            },
+       { "linux_immutable",    CAP_LINUX_IMMUTABLE    },
+       { "net_bind_service",   CAP_NET_BIND_SERVICE   },
+       { "net_broadcast",      CAP_NET_BROADCAST      },
+       { "net_admin",          CAP_NET_ADMIN          },
+       { "net_raw",            CAP_NET_RAW            },
+       { "ipc_lock",           CAP_IPC_LOCK           },
+       { "ipc_owner",          CAP_IPC_OWNER          },
+       { "sys_module",         CAP_SYS_MODULE         },
+       { "sys_rawio",          CAP_SYS_RAWIO          },
+       { "sys_chroot",         CAP_SYS_CHROOT         },
+       { "sys_ptrace",         CAP_SYS_PTRACE         },
+       { "sys_pacct",          CAP_SYS_PACCT          },
+       { "sys_admin",          CAP_SYS_ADMIN          },
+       { "sys_boot",           CAP_SYS_BOOT           },
+       { "sys_nice",           CAP_SYS_NICE           },
+       { "sys_resource",       CAP_SYS_RESOURCE       },
+       { "sys_time",           CAP_SYS_TIME           },
+       { "sys_tty_config",     CAP_SYS_TTY_CONFIG     },
+       { "mknod",              CAP_MKNOD              },
+       { "lease",              CAP_LEASE              },
+       { "audit_write",        CAP_AUDIT_WRITE        },
+       { "audit_control",      CAP_AUDIT_CONTROL      },
+       { "setfcap",            CAP_SETFCAP            },
+       { "mac_override",       CAP_MAC_OVERRIDE       },
+       { "mac_admin",          CAP_MAC_ADMIN          },
+       { "syslog",             CAP_SYSLOG             },
+       { "wake_alarm",         CAP_WAKE_ALARM         },
+       { "block_suspend",      CAP_BLOCK_SUSPEND      },
+       { "audit_read",         CAP_AUDIT_READ         },
+       { "perfmon",            CAP_PERFMON            },
+       { "bpf",                CAP_BPF                },
+       { "checkpoint_restore", CAP_CHECKPOINT_RESTORE },
 #endif
 };
 
@@ -607,7 +597,7 @@ static int add_shmount_to_list(struct lxc_conf *conf)
 
 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
 {
-       int i, r;
+       int i, ret;
        static struct {
                int match_mask;
                int match_flag;
@@ -616,6 +606,7 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha
                const char *fstype;
                unsigned long flags;
                const char *options;
+               bool requires_cap_net_admin;
        } default_mounts[] = {
                /* Read-only bind-mounting... In older kernels, doing that
                 * required to do one MS_BIND mount and then
@@ -629,27 +620,44 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha
                 * it's busy...  MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for
                 * kernels as low as 2.6.32...
                 */
-               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc",                                           "%r/proc",                    "proc",  MS_NODEV|MS_NOEXEC|MS_NOSUID,                    NULL },
+               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc",                                           "%r/proc",                    "proc",  MS_NODEV|MS_NOEXEC|MS_NOSUID,                    NULL, false },
                /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
-               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net",                                "%r/proc/tty",                NULL,    MS_BIND,                                         NULL },
-               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys",                                    "%r/proc/sys",                NULL,    MS_BIND,                                         NULL },
-               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL,                                             "%r/proc/sys",                NULL,    MS_REMOUNT|MS_BIND|MS_RDONLY,                    NULL },
-               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty",                                    "%r/proc/sys/net",            NULL,    MS_MOVE,                                         NULL },
-               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger",                          "%r/proc/sysrq-trigger",      NULL,    MS_BIND,                                         NULL },
-               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL,                                             "%r/proc/sysrq-trigger",      NULL,    MS_REMOUNT|MS_BIND|MS_RDONLY,                    NULL },
-               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW,    "proc",                                           "%r/proc",                    "proc",  MS_NODEV|MS_NOEXEC|MS_NOSUID,                    NULL },
-               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_RW,     "sysfs",                                          "%r/sys",                     "sysfs", 0,                                               NULL },
-               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_RO,     "sysfs",                                          "%r/sys",                     "sysfs", MS_RDONLY,                                       NULL },
-               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "sysfs",                                          "%r/sys",                     "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID,                    NULL },
-               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "%r/sys",                                         "%r/sys",                     NULL,    MS_BIND,                                         NULL },
-               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  NULL,                                             "%r/sys",                     NULL,    MS_REMOUNT|MS_BIND|MS_RDONLY,                    NULL },
-               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "sysfs",                                          "%r/sys/devices/virtual/net", "sysfs", 0,                                               NULL },
-               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL,    MS_BIND,                                         NULL },
-               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  NULL,                                             "%r/sys/devices/virtual/net", NULL,    MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
-               { 0,                  0,                   NULL,                                             NULL,                         NULL,    0,                                               NULL }
+               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net",                                "%r/proc/tty",                NULL,    MS_BIND,                                         NULL, true  },
+               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys",                                    "%r/proc/sys",                NULL,    MS_BIND,                                         NULL, false },
+               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL,                                             "%r/proc/sys",                NULL,    MS_REMOUNT|MS_BIND|MS_RDONLY,                    NULL, false },
+               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty",                                    "%r/proc/sys/net",            NULL,    MS_MOVE,                                         NULL, true  },
+               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger",                          "%r/proc/sysrq-trigger",      NULL,    MS_BIND,                                         NULL, false },
+               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL,                                             "%r/proc/sysrq-trigger",      NULL,    MS_REMOUNT|MS_BIND|MS_RDONLY,                    NULL, false },
+               { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW,    "proc",                                           "%r/proc",                    "proc",  MS_NODEV|MS_NOEXEC|MS_NOSUID,                    NULL, false },
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_RW,     "sysfs",                                          "%r/sys",                     "sysfs", 0,                                               NULL, false },
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_RO,     "sysfs",                                          "%r/sys",                     "sysfs", MS_RDONLY,                                       NULL, false },
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "sysfs",                                          "%r/sys",                     "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID,                    NULL, false },
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "%r/sys",                                         "%r/sys",                     NULL,    MS_BIND,                                         NULL, false },
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  NULL,                                             "%r/sys",                     NULL,    MS_REMOUNT|MS_BIND|MS_RDONLY,                    NULL, false },
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "sysfs",                                          "%r/sys/devices/virtual/net", "sysfs", 0,                                               NULL, false },
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL,    MS_BIND,                                         NULL, false },
+               { LXC_AUTO_SYS_MASK,  LXC_AUTO_SYS_MIXED,  NULL,                                             "%r/sys/devices/virtual/net", NULL,    MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL, false },
+               { 0,                  0,                   NULL,                                             NULL,                         NULL,    0,                                               NULL, false }
        };
+        struct lxc_rootfs *rootfs = &conf->rootfs;
+        bool has_cap_net_admin;
+
+        if (flags & LXC_AUTO_PROC_MASK) {
+               ret = mkdirat(rootfs->dfd_mnt, "proc" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+               if (ret < 0 && errno != EEXIST)
+                       return log_error_errno(-errno, errno,
+                                              "Failed to create proc mountpoint under %d", rootfs->dfd_mnt);
+       }
+
+       if (flags & LXC_AUTO_SYS_MASK) {
+               ret = mkdirat(rootfs->dfd_mnt, "sys" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+               if (ret < 0 && errno != EEXIST)
+                       return log_error_errno(-errno, errno,
+                                              "Failed to create sysfs mountpoint under %d", rootfs->dfd_mnt);
+       }
 
-       for (i = 0; default_mounts[i].match_mask; i++) {
+        has_cap_net_admin = lxc_wants_cap(CAP_NET_ADMIN, conf);
+        for (i = 0; default_mounts[i].match_mask; i++) {
                __do_free char *destination = NULL, *source = NULL;
                int saved_errno;
                unsigned long mflags;
@@ -658,7 +666,7 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha
 
                if (default_mounts[i].source) {
                        /* will act like strdup if %r is not present */
-                       source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
+                       source = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].source);
                        if (!source)
                                return -1;
                }
@@ -666,25 +674,30 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha
                if (!default_mounts[i].destination)
                        return log_error(-1, "BUG: auto mounts destination %d was NULL", i);
 
+               if (!has_cap_net_admin && default_mounts[i].requires_cap_net_admin) {
+                       TRACE("Container does not have CAP_NET_ADMIN. Skipping \"%s\" mount", default_mounts[i].source ?: "(null)");
+                       continue;
+               }
+
                /* will act like strdup if %r is not present */
-               destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
+               destination = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].destination);
                if (!destination)
                        return -1;
 
                mflags = add_required_remount_flags(source, destination,
                                                    default_mounts[i].flags);
-               r = safe_mount(source, destination, default_mounts[i].fstype,
-                              mflags, default_mounts[i].options,
-                              conf->rootfs.path ? conf->rootfs.mount : NULL);
+               ret = safe_mount(source, destination, default_mounts[i].fstype,
+                               mflags, default_mounts[i].options,
+                               rootfs->path ? rootfs->mount : NULL);
                saved_errno = errno;
-               if (r < 0 && errno == ENOENT) {
+               if (ret < 0 && errno == ENOENT) {
                        INFO("Mount source or target for \"%s\" on \"%s\" does not exist. Skipping", source, destination);
-                       r = 0;
-               } else if (r < 0) {
+                       ret = 0;
+               } else if (ret < 0) {
                        SYSERROR("Failed to mount \"%s\" on \"%s\" with flags %lu", source, destination, mflags);
                }
 
-               if (r < 0) {
+               if (ret < 0) {
                        errno = saved_errno;
                        return -1;
                }
@@ -721,15 +734,12 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha
                if (flags & LXC_AUTO_CGROUP_FORCE)
                        cg_flags |= LXC_AUTO_CGROUP_FORCE;
 
-               if (!handler->cgroup_ops->mount(handler->cgroup_ops,
-                                               handler,
-                                               conf->rootfs.path ? conf->rootfs.mount : "",
-                                               cg_flags))
+               if (!handler->cgroup_ops->mount(handler->cgroup_ops, conf, cg_flags))
                        return log_error_errno(-1, errno, "Failed to mount \"/sys/fs/cgroup\"");
        }
 
        if (flags & LXC_AUTO_SHMOUNTS_MASK) {
-               int ret = add_shmount_to_list(conf);
+               ret = add_shmount_to_list(conf);
                if (ret < 0)
                        return log_error(-1, "Failed to add shmount entry to container config");
        }
@@ -768,31 +778,30 @@ static const struct dev_symlinks dev_symlinks[] = {
 
 static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
 {
-       int i, ret;
-       char path[PATH_MAX];
-       struct stat s;
-
-       for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
+       for (int i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
+               int ret;
+               struct stat s;
                const struct dev_symlinks *d = &dev_symlinks[i];
 
-               ret = snprintf(path, sizeof(path), "%s/dev/%s",
-                              rootfs->path ? rootfs->mount : "", d->name);
-               if (ret < 0 || (size_t)ret >= sizeof(path))
-                       return -1;
-
-               /* Stat the path first. If we don't get an error accept it as
+               /*
+                * Stat the path first. If we don't get an error accept it as
                 * is and don't try to create it
                 */
-               ret = stat(path, &s);
+               ret = fstatat(rootfs->dfd_dev, d->name, &s, 0);
                if (ret == 0)
                        continue;
 
-               ret = symlink(d->oldpath, path);
-               if (ret && errno != EEXIST) {
-                       if (errno == EROFS)
-                               WARN("Failed to create \"%s\". Read-only filesystem", path);
-                       else
-                               return log_error_errno(-1, errno, "Failed to create \"%s\"", path);
+               ret = symlinkat(d->oldpath, rootfs->dfd_dev, d->name);
+               if (ret) {
+                       switch (errno) {
+                       case EROFS:
+                               WARN("Failed to create \"%s\" on read-only filesystem", d->name);
+                               __fallthrough;
+                       case EEXIST:
+                               break;
+                       default:
+                               return log_error_errno(-errno, errno, "Failed to create \"%s\"", d->name);
+                       }
                }
        }
 
@@ -1062,14 +1071,14 @@ static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
        DEBUG("Using mount options: %s", mount_options);
 
        cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
-       ret = mkdirat(rootfs->mntpt_fd, "dev" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+       ret = mkdirat(rootfs->dfd_mnt, "dev" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
        if (ret < 0 && errno != EEXIST) {
                SYSERROR("Failed to create \"/dev\" directory");
                ret = -errno;
                goto reset_umask;
        }
 
-       ret = safe_mount_beneath_at(rootfs->mntpt_fd, "none", "dev", "tmpfs", 0, mount_options);
+       ret = safe_mount_beneath_at(rootfs->dfd_mnt, "none", "dev", "tmpfs", 0, mount_options);
        if (ret < 0) {
                __do_free char *fallback_path = NULL;
 
@@ -1094,7 +1103,7 @@ static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
        /* If we are running on a devtmpfs mapping, dev/pts may already exist.
         * If not, then create it and exit if that fails...
         */
-       ret = mkdirat(rootfs->mntpt_fd, "dev/pts", S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+       ret = mkdirat(rootfs->dfd_mnt, "dev/pts", S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
        if (ret < 0 && errno != EEXIST) {
                SYSERROR("Failed to create directory \"%s\"", path);
                ret = -errno;
@@ -1136,28 +1145,22 @@ enum {
 
 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
 {
-       __do_close int dev_dir_fd = -EBADF;
        int i, ret;
        mode_t cmask;
        int use_mknod = LXC_DEVNODE_MKNOD;
 
-       /* ignore, just don't try to fill in */
-       if (!exists_dir_at(rootfs->mntpt_fd, "dev"))
-               return 0;
-
-       dev_dir_fd = openat(rootfs->mntpt_fd, "dev/", O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH | O_NOFOLLOW);
-       if (dev_dir_fd < 0)
-               return -errno;
+       if (rootfs->dfd_dev < 0)
+               return log_info(0, "No /dev directory found, skipping setup");
 
        INFO("Populating \"/dev\"");
 
        cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
        for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
-               char hostpath[PATH_MAX], path[PATH_MAX];
+               char device_path[PATH_MAX];
                const struct lxc_device_node *device = &lxc_devices[i];
 
                if (use_mknod >= LXC_DEVNODE_MKNOD) {
-                       ret = mknodat(dev_dir_fd, device->name, device->mode, makedev(device->maj, device->min));
+                       ret = mknodat(rootfs->dfd_dev, device->name, device->mode, makedev(device->maj, device->min));
                        if (ret == 0 || (ret < 0 && errno == EEXIST)) {
                                DEBUG("Created device node \"%s\"", device->name);
                        } else if (ret < 0) {
@@ -1177,7 +1180,7 @@ static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
                                 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
                                 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
                                 */
-                               fd = openat(dev_dir_fd, device->name, O_RDONLY | O_CLOEXEC);
+                               fd = open_at(rootfs->dfd_dev, device->name, PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
                                if (fd >= 0) {
                                        /* Device nodes are fully useable. */
                                        use_mknod = LXC_DEVNODE_OPEN;
@@ -1195,25 +1198,52 @@ static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
                         * nodes the prio mknod() call will have created the
                         * device node so we can use it as a bind-mount target.
                         */
-                       ret = mknodat(dev_dir_fd, device->name, S_IFREG | 0000, 0);
+                       ret = mknodat(rootfs->dfd_dev, device->name, S_IFREG | 0000, 0);
                        if (ret < 0 && errno != EEXIST)
                                return log_error_errno(-1, errno, "Failed to create file \"%s\"", device->name);
                }
 
                /* Fallback to bind-mounting the device from the host. */
-               snprintf(hostpath, sizeof(hostpath), "/dev/%s", device->name);
-
-               ret = safe_mount_beneath_at(dev_dir_fd, hostpath, device->name, NULL, MS_BIND, NULL);
+               ret = snprintf(device_path, sizeof(device_path), "dev/%s", device->name);
+               if (ret < 0 || (size_t)ret >= sizeof(device_path))
+                       return ret_errno(EIO);
+
+               ret = mount_from_at(rootfs->dfd_host, device_path,
+                                   PROTECT_OPATH_FILE,
+                                   PROTECT_LOOKUP_BENEATH_XDEV,
+                                   rootfs->dfd_dev, device->name,
+                                   PROTECT_OPATH_FILE,
+                                   PROTECT_LOOKUP_BENEATH,
+                                   NULL /* fstype */,
+                                   MS_BIND /* mount flags */,
+                                   NULL);
                if (ret < 0) {
-                       const char *mntpt = rootfs->path ? rootfs->mount : NULL;
-                       if (errno == ENOSYS) {
-                               snprintf(path, sizeof(path), "%s/dev/%s", mntpt, device->name);
-                               ret = safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL);
-                       }
+                       char path[PATH_MAX];
+
+                       if (errno != ENOSYS)
+                               return log_error_errno(-errno, errno,
+                                                      "Failed to mount %d(%s) to %d(%s)",
+                                                      rootfs->dfd_host,
+                                                      device_path,
+                                                      rootfs->dfd_dev,
+                                                      device->name);
+
+                       ret = snprintf(device_path, sizeof(device_path), "/dev/%s", device->name);
+                       if (ret < 0 || (size_t)ret >= sizeof(device_path))
+                               return ret_errno(EIO);
+
+                       ret = snprintf(path, sizeof(path), "%s/dev/%s", get_rootfs_mnt(rootfs), device->name);
+                       if (ret < 0 || ret >= sizeof(path))
+                               return log_error(-1, "Failed to create device path for %s", device->name);
+
+                       ret = safe_mount(device_path, path, 0, MS_BIND, NULL, get_rootfs_mnt(rootfs));
+                       if (ret < 0)
+                               return log_error_errno(-1, errno, "Failed to bind mount host device node \"%s\" to \"%s\"", device_path, path);
+
+                       DEBUG("Bind mounted host device node \"%s\" to \"%s\"", device_path, path);
+                       continue;
                }
-               if (ret < 0)
-                       return log_error_errno(-1, errno, "Failed to bind mount host device node \"%s\" onto \"%s\"", hostpath, device->name);
-               DEBUG("Bind mounted host device node \"%s\" onto \"%s\"", hostpath, device->name);
+               DEBUG("Bind mounted host device %d(%s) to %d(%s)", rootfs->dfd_host, device_path, rootfs->dfd_dev, device->name);
        }
        (void)umask(cmask);
 
@@ -1232,8 +1262,8 @@ static int lxc_mount_rootfs(struct lxc_conf *conf)
                if (ret < 0)
                        return log_error_errno(-1, errno, "Failed to recursively turn root mount tree into dependent mount");
 
-               rootfs->mntpt_fd = openat(-1, "/", O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH);
-               if (rootfs->mntpt_fd < 0)
+               rootfs->dfd_mnt = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
+               if (rootfs->dfd_mnt < 0)
                        return -errno;
 
                return 0;
@@ -1261,8 +1291,8 @@ static int lxc_mount_rootfs(struct lxc_conf *conf)
              rootfs->path, rootfs->mount,
              rootfs->options ? rootfs->options : "(null)");
 
-       rootfs->mntpt_fd = openat(-1, rootfs->mount, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH);
-       if (rootfs->mntpt_fd < 0)
+       rootfs->dfd_mnt = open_at(-EBADF, rootfs->mount, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
+       if (rootfs->dfd_mnt < 0)
                return -errno;
 
        return 0;
@@ -1384,54 +1414,50 @@ static int lxc_chroot(const struct lxc_rootfs *rootfs)
  *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
  *    first.
  */
-static int lxc_pivot_root(const char *rootfs)
+static int lxc_pivot_root(const struct lxc_rootfs *rootfs)
 {
-       __do_close int oldroot = -EBADF, newroot = -EBADF;
+       __do_close int fd_oldroot = -EBADF;
        int ret;
 
-       oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
-       if (oldroot < 0)
+       fd_oldroot = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
+       if (fd_oldroot < 0)
                return log_error_errno(-1, errno, "Failed to open old root directory");
 
-       newroot = open(rootfs, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
-       if (newroot < 0)
-               return log_error_errno(-1, errno, "Failed to open new root directory");
-
        /* change into new root fs */
-       ret = fchdir(newroot);
+       ret = fchdir(rootfs->dfd_mnt);
        if (ret < 0)
-               return log_error_errno(-1, errno, "Failed to change to new rootfs \"%s\"", rootfs);
+               return log_error_errno(-errno, errno, "Failed to change into new root directory \"%s\"", rootfs->mount);
 
        /* pivot_root into our new root fs */
        ret = pivot_root(".", ".");
        if (ret < 0)
-               return log_error_errno(-1, errno, "Failed to pivot_root()");
+               return log_error_errno(-errno, errno, "Failed to pivot into new root directory \"%s\"", rootfs->mount);
 
        /* At this point the old-root is mounted on top of our new-root. To
         * unmounted it we must not be chdir'd into it, so escape back to
         * old-root.
         */
-       ret = fchdir(oldroot);
+       ret = fchdir(fd_oldroot);
        if (ret < 0)
-               return log_error_errno(-1, errno, "Failed to enter old root directory");
+               return log_error_errno(-errno, errno, "Failed to enter old root directory");
 
-       /* Make oldroot a depedent mount to make sure our umounts don't propagate to the
-        * host.
+       /*
+        * Make fd_oldroot a dependent mount to make sure our umounts don't
+        * propagate to the host.
         */
        ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
        if (ret < 0)
-               return log_error_errno(-1, errno, "Failed to recursively turn old root mount tree into dependent mount");
+               return log_error_errno(-errno, errno, "Failed to recursively turn old root mount tree into dependent mount");
 
        ret = umount2(".", MNT_DETACH);
        if (ret < 0)
-               return log_error_errno(-1, errno, "Failed to detach old root directory");
+               return log_error_errno(-errno, errno, "Failed to detach old root directory");
 
-       ret = fchdir(newroot);
+       ret = fchdir(rootfs->dfd_mnt);
        if (ret < 0)
-               return log_error_errno(-1, errno, "Failed to re-enter new root directory");
-
-       TRACE("pivot_root(\"%s\") successful", rootfs);
+               return log_error_errno(-errno, errno, "Failed to re-enter new root directory \"%s\"", rootfs->mount);
 
+       TRACE("Changed into new rootfs \"%s\"", rootfs->mount);
        return 0;
 }
 
@@ -1443,7 +1469,7 @@ static int lxc_setup_rootfs_switch_root(const struct lxc_rootfs *rootfs)
        if (detect_ramfs_rootfs())
                return lxc_chroot(rootfs);
 
-       return lxc_pivot_root(rootfs->mount);
+       return lxc_pivot_root(rootfs);
 }
 
 static const struct id_map *find_mapped_nsid_entry(const struct lxc_conf *conf,
@@ -1477,7 +1503,23 @@ static const struct id_map *find_mapped_nsid_entry(const struct lxc_conf *conf,
        return retmap;
 }
 
-static int lxc_setup_devpts(struct lxc_handler *handler)
+/*
+ * Receive the devpts file descriptor sent by the child over
+ * handler->data_sock[1] and store it in handler->conf->devpts_fd.
+ *
+ * Does nothing and returns 0 when conf->pty_max <= 0. Returns 0 on success
+ * and -1 (after logging errno) when the receive call fails.
+ */
+int lxc_setup_devpts_parent(struct lxc_handler *handler)
+{
+       int ret;
+
+       if (handler->conf->pty_max <= 0)
+               return 0;
+
+       /*
+        * devpts_fd is handed to the receive call twice: once as the array
+        * that takes the received fd and once as the plain data buffer.
+        * NOTE(review): presumably so that a payload sent without an fd still
+        * ends up in devpts_fd - confirm against lxc_abstract_unix_recv_fds().
+        */
+       ret = lxc_abstract_unix_recv_fds(handler->data_sock[1], &handler->conf->devpts_fd, 1,
+                                        &handler->conf->devpts_fd, sizeof(handler->conf->devpts_fd));
+       if (ret < 0)
+               return log_error_errno(-1, errno, "Failed to receive devpts fd from child");
+
+       TRACE("Received devpts file descriptor %d from child", handler->conf->devpts_fd);
+       return 0;
+}
+
+static int lxc_setup_devpts_child(struct lxc_handler *handler)
 {
        __do_close int devpts_fd = -EBADF;
        int ret;
@@ -1486,6 +1528,7 @@ static int lxc_setup_devpts(struct lxc_handler *handler)
        char *mntopt_sets[5];
        char default_devpts_mntopts[256] = "gid=5,newinstance,ptmxmode=0666,mode=0620";
        struct lxc_conf *conf = handler->conf;
+       struct lxc_rootfs *rootfs = &conf->rootfs;
        int sock = handler->data_sock[0];
 
        if (conf->pty_max <= 0)
@@ -1499,7 +1542,7 @@ static int lxc_setup_devpts(struct lxc_handler *handler)
        (void)umount2("/dev/pts", MNT_DETACH);
 
        /* Create mountpoint for devpts instance. */
-       ret = mkdir("/dev/pts", 0755);
+       ret = mkdirat(rootfs->dfd_dev, "pts", 0755);
        if (ret < 0 && errno != EEXIST)
                return log_error_errno(-1, errno, "Failed to create \"/dev/pts\" directory");
 
@@ -1529,18 +1572,21 @@ static int lxc_setup_devpts(struct lxc_handler *handler)
                return log_error_errno(-1, errno, "Failed to mount new devpts instance");
        DEBUG("Mount new devpts instance with options \"%s\"", *opts);
 
-       devpts_fd = openat(-EBADF, "/dev/pts", O_CLOEXEC | O_DIRECTORY | O_PATH | O_NOFOLLOW);
+       devpts_fd = open_at(rootfs->dfd_dev, "pts", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
        if (devpts_fd < 0) {
+               devpts_fd = -EBADF;
                TRACE("Failed to create detached devpts mount");
-               ret = lxc_abstract_unix_send_fds(sock, NULL, 0, NULL, 0);
+               ret = lxc_abstract_unix_send_fds(sock, NULL, 0, &devpts_fd, sizeof(int));
        } else {
                ret = lxc_abstract_unix_send_fds(sock, &devpts_fd, 1, NULL, 0);
        }
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to send devpts fd to parent");
 
+       TRACE("Sent devpts file descriptor %d to parent", devpts_fd);
+
        /* Remove any pre-existing /dev/ptmx file. */
-       ret = remove("/dev/ptmx");
+       ret = unlinkat(rootfs->dfd_dev, "ptmx", 0);
        if (ret < 0) {
                if (errno != ENOENT)
                        return log_error_errno(-1, errno, "Failed to remove existing \"/dev/ptmx\" file");
@@ -1549,7 +1595,7 @@ static int lxc_setup_devpts(struct lxc_handler *handler)
        }
 
        /* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
-       ret = mknod("/dev/ptmx", S_IFREG | 0000, 0);
+       ret = mknodat(rootfs->dfd_dev, "ptmx", S_IFREG | 0000, 0);
        if (ret < 0 && errno != EEXIST)
                return log_error_errno(-1, errno, "Failed to create dummy \"/dev/ptmx\" file as bind mount target");
        DEBUG("Created dummy \"/dev/ptmx\" file as bind mount target");
@@ -1563,16 +1609,16 @@ static int lxc_setup_devpts(struct lxc_handler *handler)
                ERROR("Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
 
        /* Remove the dummy /dev/ptmx file we created above. */
-       ret = remove("/dev/ptmx");
+       ret = unlinkat(rootfs->dfd_dev, "ptmx", 0);
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to remove existing \"/dev/ptmx\"");
 
        /* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
-       ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
+       ret = symlinkat("/dev/pts/ptmx", rootfs->dfd_dev, "/dev/ptmx");
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to create symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
-       DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
 
+       DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
        return 0;
 }
 
@@ -1614,7 +1660,7 @@ static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
         * When we are asked to setup a console we remove any previous
         * /dev/console bind-mounts.
         */
-       if (exists_file_at(rootfs->dev_mntpt_fd, "console")) {
+       if (exists_file_at(rootfs->dfd_dev, "console")) {
                ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
                if (ret < 0 || (size_t)ret >= sizeof(path))
                        return -1;
@@ -1630,7 +1676,7 @@ static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
         * For unprivileged containers autodev or automounts will already have
         * taken care of creating /dev/console.
         */
-       ret = mknodat(rootfs->dev_mntpt_fd, "console", S_IFREG | 0000, 0);
+       ret = mknodat(rootfs->dfd_dev, "console", S_IFREG | 0000, 0);
        if (ret < 0 && errno != EEXIST)
                return log_error_errno(-errno, errno, "Failed to create console");
 
@@ -1639,19 +1685,19 @@ static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
                return log_error_errno(-errno, errno, "Failed to set mode \"0%o\" to \"%s\"", S_IXUSR | S_IXGRP, console->name);
 
        if (pty_mnt_fd >= 0) {
-               ret = move_mount(pty_mnt_fd, "", rootfs->dev_mntpt_fd, "console", MOVE_MOUNT_F_EMPTY_PATH);
+               ret = move_mount(pty_mnt_fd, "", rootfs->dfd_dev, "console", MOVE_MOUNT_F_EMPTY_PATH);
                if (!ret) {
-                       DEBUG("Moved mount \"%s\" onto \"%s\"", console->name, path);
-                       goto finish;
+                       DEBUG("Moved mount \"%s\" onto %d(console)", console->name, rootfs->dfd_dev);
+                       return 0;
                }
 
                if (ret && errno != ENOSYS)
                        return log_error_errno(-1, errno,
-                                              "Failed to mount %d(%s) on \"%s\"",
-                                              pty_mnt_fd, console->name, path);
+                                              "Failed to mount %d(%s) on %d(console)",
+                                              pty_mnt_fd, console->name, rootfs->dfd_dev);
        }
 
-       ret = safe_mount_beneath_at(rootfs->dev_mntpt_fd, console->name, "console", NULL, MS_BIND, NULL);
+       ret = safe_mount_beneath_at(rootfs->dfd_dev, console->name, "console", NULL, MS_BIND, NULL);
        if (ret < 0) {
                if (errno == ENOSYS) {
                        ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
@@ -1664,7 +1710,6 @@ static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
                }
        }
 
-finish:
        DEBUG("Mounted pty device %d(%s) onto \"%s\"", pty_mnt_fd, console->name, path);
        return 0;
 }
@@ -2187,8 +2232,7 @@ static int mount_entry_on_relative_rootfs(struct mntent *mntent,
        return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
 }
 
-static int mount_file_entries(const struct lxc_conf *conf,
-                             const struct lxc_rootfs *rootfs, FILE *file,
+static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
                              const char *lxc_name, const char *lxc_path)
 {
        char buf[PATH_MAX];
@@ -2237,7 +2281,7 @@ static int setup_mount(const struct lxc_conf *conf,
        if (!f)
                return log_error_errno(-1, errno, "Failed to open \"%s\"", fstab);
 
-       ret = mount_file_entries(conf, rootfs, f, lxc_name, lxc_path);
+       ret = mount_file_entries(rootfs, f, lxc_name, lxc_path);
        if (ret < 0)
                ERROR("Failed to set up mount entries");
 
@@ -2324,7 +2368,7 @@ static int setup_mount_entries(const struct lxc_conf *conf,
        if (!f)
                return -1;
 
-       return mount_file_entries(conf, rootfs, f, lxc_name, lxc_path);
+       return mount_file_entries(rootfs, f, lxc_name, lxc_path);
 }
 
 static int parse_cap(const char *cap)
@@ -2589,8 +2633,9 @@ struct lxc_conf *lxc_conf_init(void)
                return NULL;
        }
        new->rootfs.managed = true;
-       new->rootfs.mntpt_fd = -EBADF;
-       new->rootfs.dev_mntpt_fd = -EBADF;
+       new->rootfs.dfd_mnt = -EBADF;
+       new->rootfs.dfd_dev = -EBADF;
+       new->rootfs.dfd_host = -EBADF;
        new->logfd = -1;
        lxc_list_init(&new->cgroup);
        lxc_list_init(&new->cgroup2);
@@ -2618,8 +2663,7 @@ struct lxc_conf *lxc_conf_init(void)
        new->lsm_se_context = NULL;
        new->lsm_se_keyring_context = NULL;
        new->keyring_disable_session = false;
-       new->tmp_umount_proc = false;
-       new->tmp_umount_proc = 0;
+       new->transient_procfs_mnt = false;
        new->shmount.path_host = NULL;
        new->shmount.path_cont = NULL;
 
@@ -2927,19 +2971,88 @@ again:
        return freeid;
 }
 
+/*
+ * Mount a proc under @rootfs if proc self points to a pid other than
+ * my own.  This is needed to have a known-good proc mount for setting
+ * up LSMs both at container startup and attach.
+ *
+ * Returns 1 when a new procfs instance was mounted and 0 when the procfs
+ * found under @rootfs already points at the caller's pid. Failures are
+ * reported with error values; note that only the final mount failure
+ * returns exactly -1, the earlier error paths return -errno style values.
+ * NOTE(review): confirm callers do not depend on one specific error value.
+ *
+ * NOTE: not to be called from inside the container namespace!
+ */
+static int lxc_transient_proc(struct lxc_rootfs *rootfs)
+{
+       __do_close int fd_proc = -EBADF;
+       int link_to_pid, link_len, pid_self, ret;
+       char link[INTTYPE_TO_STRLEN(pid_t) + 1];
+
+       link_len = readlinkat(rootfs->dfd_mnt, "proc/self", link, sizeof(link));
+       if (link_len < 0) {
+               /* proc/self is not readable: make sure the mountpoint exists and mount. */
+               ret = mkdirat(rootfs->dfd_mnt, "proc", 0000);
+               if (ret < 0 && errno != EEXIST)
+                       return log_error_errno(-errno, errno, "Failed to create %d(proc)", rootfs->dfd_mnt);
+
+               goto domount;
+       } else if ((size_t)link_len >= sizeof(link)) {
+               /* A completely filled buffer leaves no room for the terminator below. */
+               return log_error_errno(-EIO, EIO, "Truncated link target");
+       }
+       link[link_len] = '\0';
+
+       pid_self = lxc_raw_getpid();
+       INFO("Caller's PID is %d; /proc/self points to %s", pid_self, link);
+
+       /*
+        * NOTE(review): the (-ret, ret) pair assumes lxc_safe_int() reports
+        * failure as a positive errno value; if it returns a negative one the
+        * signs here are inverted - confirm against its definition.
+        */
+       ret = lxc_safe_int(link, &link_to_pid);
+       if (ret)
+               return log_error_errno(-ret, ret, "Failed to parse %s", link);
+
+       /* Correct procfs is already mounted. */
+       if (link_to_pid == pid_self)
+               return log_trace(0, "Correct procfs instance mounted");
+
+       fd_proc = open_at(rootfs->dfd_mnt, "proc", PROTECT_OPATH_DIRECTORY,
+                         PROTECT_LOOKUP_BENEATH_XDEV, 0);
+       if (fd_proc < 0)
+               return log_error_errno(-errno, errno, "Failed to open transient procfs mountpoint");
+
+       /* Address the mountpoint through /proc/self/fd/<fd_proc>. */
+       ret = snprintf(rootfs->buf, sizeof(rootfs->buf), "/proc/self/fd/%d", fd_proc);
+       if (ret < 0 || (size_t)ret >= sizeof(rootfs->buf))
+               return ret_errno(EIO);
+
+       /* Detach the procfs that belongs to someone else; failure is only warned about. */
+       ret = umount2(rootfs->buf, MNT_DETACH);
+       if (ret < 0)
+               SYSWARN("Failed to umount \"%s\" with MNT_DETACH", rootfs->buf);
+
+domount:
+       /*
+        * rootfs is NULL
+        * NOTE(review): when this label is reached through the goto above,
+        * rootfs->buf has not been written by this function - confirm what it
+        * holds in that case.
+        */
+       if (!rootfs->path) {
+               ret = mount("proc", rootfs->buf, "proc", 0, NULL);
+       } else {
+               ret = safe_mount_beneath_at(rootfs->dfd_mnt, "none", "proc", "proc", 0, NULL);
+               if (ret < 0) {
+                       /* rootfs->path is known to be set here, so use the mount path directly. */
+                       ret = snprintf(rootfs->buf, sizeof(rootfs->buf), "%s/proc", rootfs->mount);
+                       if (ret < 0 || (size_t)ret >= sizeof(rootfs->buf))
+                               return ret_errno(EIO);
+
+                       ret = safe_mount("proc", rootfs->buf, "proc", 0, NULL, rootfs->mount);
+               }
+       }
+       if (ret < 0)
+               return log_error_errno(-1, errno, "Failed to mount temporary procfs");
+
+       INFO("Created transient procfs mount");
+       return 1;
+}
+
 /* NOTE: Must not be called from inside the container namespace! */
 static int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
 {
        int mounted;
 
-       mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
+       mounted = lxc_transient_proc(&conf->rootfs);
        if (mounted == -1) {
-               SYSERROR("Failed to mount proc in the container");
                /* continue only if there is no rootfs */
                if (conf->rootfs.path)
-                       return -1;
+                       return log_error_errno(-EPERM, EPERM, "Failed to create transient procfs mount");
        } else if (mounted == 1) {
-               conf->tmp_umount_proc = true;
+               conf->transient_procfs_mnt = true;
        }
 
        return 0;
@@ -2947,11 +3060,10 @@ static int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
 
 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
 {
-       if (!lxc_conf->tmp_umount_proc)
-               return;
-
-       (void)umount2("/proc", MNT_DETACH);
-       lxc_conf->tmp_umount_proc = false;
+       /*
+        * Only undo the mount when lxc_create_tmp_proc_mount() recorded one.
+        * The umount result is deliberately ignored and the flag is cleared
+        * either way.
+        */
+       if (lxc_conf->transient_procfs_mnt) {
+               (void)umount2("/proc", MNT_DETACH);
+               lxc_conf->transient_procfs_mnt = false;
+       }
 }
 
 /* Walk /proc/mounts and change any shared entries to dependent mounts. */
@@ -2960,9 +3072,9 @@ void turn_into_dependent_mounts(void)
        __do_free char *line = NULL;
        __do_fclose FILE *f = NULL;
        __do_close int memfd = -EBADF, mntinfo_fd = -EBADF;
-       int ret;
-       ssize_t copied;
        size_t len = 0;
+       ssize_t copied;
+       int ret;
 
        mntinfo_fd = open("/proc/self/mountinfo", O_RDONLY | O_CLOEXEC);
        if (mntinfo_fd < 0) {
@@ -2986,12 +3098,8 @@ void turn_into_dependent_mounts(void)
                }
        }
 
-again:
-       copied = lxc_sendfile_nointr(memfd, mntinfo_fd, NULL, LXC_SENDFILE_MAX);
+       copied = fd_to_fd(mntinfo_fd, memfd);
        if (copied < 0) {
-               if (errno == EINTR)
-                       goto again;
-
                SYSERROR("Failed to copy \"/proc/self/mountinfo\"");
                return;
        }
@@ -3100,6 +3208,10 @@ int lxc_setup_rootfs_prepare_root(struct lxc_conf *conf, const char *name,
 {
        int ret;
 
+       conf->rootfs.dfd_host = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
+       if (conf->rootfs.dfd_host < 0)
+               return log_error_errno(-errno, errno, "Failed to open \"/\"");
+
        if (conf->rootfs_setup) {
                const char *path = conf->rootfs.mount;
 
@@ -3110,6 +3222,10 @@ int lxc_setup_rootfs_prepare_root(struct lxc_conf *conf, const char *name,
                if (ret < 0)
                        return log_error(-1, "Failed to bind mount container / onto itself");
 
+               conf->rootfs.dfd_mnt = openat(-EBADF, path, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH | O_NOCTTY);
+               if (conf->rootfs.dfd_mnt < 0)
+                       return log_error_errno(-errno, errno, "Failed to open file descriptor for container rootfs");
+
                return log_trace(0, "Bind mounted container / onto itself");
        }
 
@@ -3211,13 +3327,49 @@ static int lxc_setup_boot_id(void)
        return 0;
 }
 
+/*
+ * Label and create the session keyring for the container.
+ *
+ * The keyring label is set through @lsm_ops using
+ * conf->lsm_se_keyring_context when present, otherwise conf->lsm_se_context;
+ * with neither set no label call is made. A failing label call is fatal and
+ * yields -1. A failure to join a new session keyring is only logged and does
+ * not change the return value.
+ *
+ * Returns the non-negative result of the label call (0 when none was made),
+ * or -1 if setting the label failed.
+ */
+static int lxc_setup_keyring(struct lsm_ops *lsm_ops, const struct lxc_conf *conf)
+{
+       key_serial_t keyring;
+       int ret = 0;
+
+       /* The dedicated keyring context takes precedence over the generic one. */
+       if (conf->lsm_se_keyring_context)
+               ret = lsm_ops->keyring_label_set(lsm_ops, conf->lsm_se_keyring_context);
+       else if (conf->lsm_se_context)
+               ret = lsm_ops->keyring_label_set(lsm_ops, conf->lsm_se_context);
+       if (ret < 0)
+               return log_error_errno(-1, errno, "Failed to set keyring context");
+
+       /*
+        * Try to allocate a new session keyring for the container to prevent
+        * information leaks.
+        */
+       keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, prctl_arg(0),
+                        prctl_arg(0), prctl_arg(0), prctl_arg(0));
+       if (keyring < 0) {
+               /* Every case below only logs; none of them turns into an error return. */
+               switch (errno) {
+               case ENOSYS:
+                       DEBUG("The keyctl() syscall is not supported or blocked");
+                       break;
+               case EACCES:
+                       __fallthrough;
+               case EPERM:
+                       DEBUG("Failed to access kernel keyring. Continuing...");
+                       break;
+               default:
+                       SYSERROR("Failed to create kernel keyring");
+                       break;
+               }
+       }
+
+       return ret;
+}
+
 int lxc_setup(struct lxc_handler *handler)
 {
        __do_close int pty_mnt_fd = -EBADF;
        int ret;
        const char *lxcpath = handler->lxcpath, *name = handler->name;
        struct lxc_conf *lxc_conf = handler->conf;
-       char *keyring_context = NULL;
 
        ret = lxc_setup_rootfs_prepare_root(lxc_conf, name, lxcpath);
        if (ret < 0)
@@ -3230,15 +3382,9 @@ int lxc_setup(struct lxc_handler *handler)
        }
 
        if (!lxc_conf->keyring_disable_session) {
-               if (lxc_conf->lsm_se_keyring_context) {
-                       keyring_context = lxc_conf->lsm_se_keyring_context;
-               } else if (lxc_conf->lsm_se_context) {
-                       keyring_context = lxc_conf->lsm_se_context;
-               }
-
-               ret = lxc_setup_keyring(keyring_context);
+               ret = lxc_setup_keyring(handler->lsm_ops, lxc_conf);
                if (ret < 0)
-                       return -1;
+                       return log_error(-1, "Failed to setup container keyring");
        }
 
        if (handler->ns_clone_flags & CLONE_NEWNET) {
@@ -3269,9 +3415,10 @@ int lxc_setup(struct lxc_handler *handler)
                        return log_error(-1, "Failed to mount \"/dev\"");
        }
 
-       lxc_conf->rootfs.dev_mntpt_fd = openat(lxc_conf->rootfs.mntpt_fd, "dev",
-                                               O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_NOFOLLOW);
-       if (lxc_conf->rootfs.dev_mntpt_fd < 0 && errno != ENOENT)
+       lxc_conf->rootfs.dfd_dev = open_at(lxc_conf->rootfs.dfd_mnt, "dev",
+                                               PROTECT_OPATH_DIRECTORY,
+                                               PROTECT_LOOKUP_BENEATH_XDEV, 0);
+       if (lxc_conf->rootfs.dfd_dev < 0 && errno != ENOENT)
                return log_error_errno(-errno, errno, "Failed to open \"/dev\"");
 
        /* Do automatic mounts (mainly /proc and /sys), but exclude those that
@@ -3295,13 +3442,13 @@ int lxc_setup(struct lxc_handler *handler)
        if (lxc_conf->is_execute) {
                if (execveat_supported()) {
                        int fd;
-                       char path[PATH_MAX];
+                       char path[STRLITERALLEN(SBINDIR) + STRLITERALLEN("/init.lxc.static") + 1];
 
-                       ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
+                       ret = snprintf(path, sizeof(path), SBINDIR "/init.lxc.static");
                        if (ret < 0 || ret >= PATH_MAX)
                                return log_error(-1, "Path to init.lxc.static too long");
 
-                       fd = open(path, O_PATH | O_CLOEXEC);
+                       fd = open(path, O_NOCTTY | O_NOFOLLOW | O_CLOEXEC | O_PATH);
                        if (fd < 0)
                                return log_error_errno(-1, errno, "Unable to open lxc.init.static");
 
@@ -3361,7 +3508,7 @@ int lxc_setup(struct lxc_handler *handler)
        if (lxc_conf->autodev > 0)
                (void)lxc_setup_boot_id();
 
-       ret = lxc_setup_devpts(handler);
+       ret = lxc_setup_devpts_child(handler);
        if (ret < 0)
                return log_error(-1, "Failed to setup new devpts instance");
 
@@ -3393,8 +3540,9 @@ int lxc_setup(struct lxc_handler *handler)
                return log_error(-1, "Failed to drop capabilities");
        }
 
-       close_prot_errno_disarm(lxc_conf->rootfs.mntpt_fd)
-       close_prot_errno_disarm(lxc_conf->rootfs.dev_mntpt_fd)
+       close_prot_errno_disarm(lxc_conf->rootfs.dfd_mnt)
+       close_prot_errno_disarm(lxc_conf->rootfs.dfd_dev)
+       close_prot_errno_disarm(lxc_conf->rootfs.dfd_host)
        NOTICE("The container \"%s\" is set up", name);
 
        return 0;
@@ -3758,8 +3906,9 @@ void lxc_conf_free(struct lxc_conf *conf)
        free(conf->rootfs.options);
        free(conf->rootfs.path);
        free(conf->rootfs.data);
-       close_prot_errno_disarm(conf->rootfs.mntpt_fd);
-       close_prot_errno_disarm(conf->rootfs.dev_mntpt_fd);
+       close_prot_errno_disarm(conf->rootfs.dfd_mnt);
+       close_prot_errno_disarm(conf->rootfs.dfd_dev);
+       close_prot_errno_disarm(conf->rootfs.dfd_host);
        free(conf->logfile);
        if (conf->logfd != -1)
                close(conf->logfd);
@@ -3783,7 +3932,6 @@ void lxc_conf_free(struct lxc_conf *conf)
        lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
        lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
        lxc_clear_devices(conf);
-       lxc_clear_cgroup2_devices(conf);
        lxc_clear_hooks(conf, "lxc.hook");
        lxc_clear_mount_entries(conf);
        lxc_clear_idmaps(conf);
@@ -3798,6 +3946,7 @@ void lxc_conf_free(struct lxc_conf *conf)
        lxc_clear_namespace(conf);
        free(conf->cgroup_meta.dir);
        free(conf->cgroup_meta.monitor_dir);
+       free(conf->cgroup_meta.monitor_pivot_dir);
        free(conf->cgroup_meta.container_dir);
        free(conf->cgroup_meta.namespace_dir);
        free(conf->cgroup_meta.controllers);
@@ -4047,8 +4196,7 @@ int userns_exec_1(const struct lxc_conf *conf, int (*fn)(void *), void *data,
 
        close_prot_errno_disarm(pipe_fds[0]);
 
-       if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
-           conf->loglevel == LXC_LOG_LEVEL_TRACE) {
+       if (lxc_log_trace()) {
                struct id_map *map;
                struct lxc_list *it;
 
@@ -4134,7 +4282,7 @@ int userns_exec_minimal(const struct lxc_conf *conf,
 
                close_prot_errno_disarm(sock_fds[0]);
 
-               if (!lxc_setgroups(0, NULL) && errno != EPERM)
+               if (!lxc_drop_groups() && errno != EPERM)
                        _exit(EXIT_FAILURE);
 
                ret = setresgid(resgid, resgid, resgid);
@@ -4162,8 +4310,7 @@ int userns_exec_minimal(const struct lxc_conf *conf,
 
        close_prot_errno_disarm(sock_fds[0]);
 
-       if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
-           conf->loglevel == LXC_LOG_LEVEL_TRACE) {
+       if (lxc_log_trace()) {
                struct id_map *map;
                struct lxc_list *it;
 
@@ -4347,8 +4494,7 @@ int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
        /* idmap will now keep track of that memory. */
        host_gid_map = NULL;
 
-       if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
-           conf->loglevel == LXC_LOG_LEVEL_TRACE) {
+       if (lxc_log_trace()) {
                lxc_list_for_each (cur, idmap) {
                        map = cur->elem;
                        TRACE("establishing %cid mapping for \"%d\" in new "
@@ -4554,12 +4700,12 @@ int userns_exec_mapped_root(const char *path, int path_fd,
                if (!lxc_switch_uid_gid(0, 0))
                        _exit(EXIT_FAILURE);
 
-               if (!lxc_setgroups(0, NULL))
+               if (!lxc_drop_groups())
                        _exit(EXIT_FAILURE);
 
                ret = fchown(target_fd, 0, st.st_gid);
                if (ret) {
-                       SYSERROR("Failed to chown %d(%s) to -1:%d", target_fd, path, st.st_gid);
+                       SYSERROR("Failed to chown %d(%s) to 0:%d", target_fd, path, st.st_gid);
                        _exit(EXIT_FAILURE);
                }
 
@@ -4569,8 +4715,7 @@ int userns_exec_mapped_root(const char *path, int path_fd,
 
        close_prot_errno_disarm(sock_fds[0]);
 
-       if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
-           conf->loglevel == LXC_LOG_LEVEL_TRACE) {
+       if (lxc_log_trace()) {
                struct id_map *map;
                struct lxc_list *it;