start: check event loop type before closing fd

[mirror_lxc.git] / src / lxc / start.c
diff --git a/src/lxc/start.c b/src/lxc/start.c

index 90ee8dbfe72a96b281e5fbf665c60a6b775f6277..1a6046c7a40d8b4bb3aebcf521545b3fce7db355 100644 (file)
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -1,8 +1,7 @@
  /* SPDX-License-Identifier: LGPL-2.1+ */
  
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE 1
-#endif
+#include "config.h"
+
  #include <dirent.h>
  #include <errno.h>
  #include <fcntl.h>
@@ -25,21 +24,23 @@
  #include <sys/wait.h>
  #include <unistd.h>
  
+#include "lxc.h"
+
  #include "af_unix.h"
+#include "attach_options.h"
  #include "caps.h"
-#include "cgroup.h"
+#include "cgroups/cgroup.h"
+#include "cgroups/cgroup_utils.h"
  #include "commands.h"
  #include "commands_utils.h"
  #include "compiler.h"
  #include "conf.h"
-#include "config.h"
  #include "confile_utils.h"
  #include "error.h"
  #include "file_utils.h"
  #include "list.h"
  #include "log.h"
  #include "lsm/lsm.h"
-#include "lxccontainer.h"
  #include "lxclock.h"
  #include "lxcseccomp.h"
  #include "macro.h"
@@ -61,8 +62,8 @@
  #include <sys/capability.h>
  #endif
  
-#ifndef HAVE_STRLCPY
-#include "include/strlcpy.h"
+#if !HAVE_STRLCPY
+#include "strlcpy.h"
  #endif
  
  lxc_log_define(start, lxc);
@@ -112,23 +113,34 @@ static void lxc_put_nsfds(struct lxc_handler *handler)
         }
  }
  
-static int lxc_try_preserve_ns(const int pid, const char *ns)
+static int lxc_try_preserve_namespace(struct lxc_handler *handler,
+                                     lxc_namespace_t idx, const char *ns)
  {
-       int fd;
+       __do_close int fd = -EBADF;
+       int ret;
  
-       fd = lxc_preserve_ns(pid, ns);
-       if (fd < 0) {
-               if (errno != ENOENT)
-                       return log_error_errno(-EINVAL,
-                                              errno, "Failed to preserve %s namespace",
-                                              ns);
+       fd = lxc_preserve_ns(handler->pid, ns);
+       if (fd < 0)
+               return -errno;
  
-               return log_warn_errno(-EOPNOTSUPP,
-                                     errno, "Kernel does not support preserving %s namespaces",
-                                     ns);
-       }
+       ret = strnprintf(handler->nsfd_paths[idx],
+                        sizeof(handler->nsfd_paths[idx]), "%s:/proc/%d/fd/%d",
+                        ns_info[idx].proc_name, handler->monitor_pid, fd);
+       if (ret < 0)
+               return ret_errno(EIO);
  
-       return fd;
+       /*
+        * In case LXC is configured for exposing information to hooks as
+        * argv-style arguments prepare an argv array we can use.
+        */
+       handler->hook_argv[handler->hook_argc] = handler->nsfd_paths[idx];
+       handler->hook_argc++;
+
+       DEBUG("Preserved %s namespace via fd %d and stashed path as %s",
+             ns_info[idx].proc_name, fd, handler->nsfd_paths[idx]);
+
+       handler->nsfd[idx] = move_fd(fd);
+       return 0;
  }
  
  /* lxc_try_preserve_namespaces: open /proc/@pid/ns/@ns for each namespace
@@ -136,35 +148,33 @@ static int lxc_try_preserve_ns(const int pid, const char *ns)
   * Return true on success, false on failure.
   */
  static bool lxc_try_preserve_namespaces(struct lxc_handler *handler,
-                                       int ns_clone_flags, pid_t pid)
+                                       int ns_clone_flags)
  {
-       int i;
+       for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++)
+               handler->nsfd[ns_idx] = -EBADF;
  
-       for (i = 0; i < LXC_NS_MAX; i++)
-               handler->nsfd[i] = -EBADF;
-
-       for (i = 0; i < LXC_NS_MAX; i++) {
-               int fd;
+       for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++) {
+               int ret;
+               const char *ns = ns_info[ns_idx].proc_name;
  
-               if ((ns_clone_flags & ns_info[i].clone_flag) == 0)
+               if ((ns_clone_flags & ns_info[ns_idx].clone_flag) == 0)
                         continue;
  
-               fd = lxc_try_preserve_ns(pid, ns_info[i].proc_name);
-               if (fd < 0) {
-                       /* Do not fail to start container on kernels that do
-                        * not support interacting with namespaces through
-                        * /proc.
-                        */
-                       if (fd == -EOPNOTSUPP)
+               ret = lxc_try_preserve_namespace(handler, ns_idx,
+                                                ns_info[ns_idx].proc_name);
+               if (ret < 0) {
+                       if (ret == -ENOENT) {
+                               SYSERROR("Kernel does not support preserving %s namespaces", ns);
                                 continue;
+                       }
  
+                       /*
+                        * Handle kernels that do not support interacting with
+                        * namespaces through procfs.
+                        */
                         lxc_put_nsfds(handler);
-                       return false;
+                       return log_error_errno(false, errno, "Failed to preserve %s namespace", ns);
                 }
-
-               handler->nsfd[i] = fd;
-               DEBUG("Preserved %s namespace via fd %d", ns_info[i].proc_name,
-                     handler->nsfd[i]);
         }
  
         return true;
@@ -183,8 +193,8 @@ static bool match_dlog_fds(struct dirent *direntp)
         ssize_t linklen;
         int ret;
  
-       ret = snprintf(path, PATH_MAX, "/proc/self/fd/%s", direntp->d_name);
-       if (ret < 0 || ret >= PATH_MAX)
+       ret = strnprintf(path, sizeof(path), "/proc/self/fd/%s", direntp->d_name);
+       if (ret < 0)
                 return log_error(false, "Failed to create file descriptor name");
  
         linklen = readlink(path, link, PATH_MAX);
@@ -193,15 +203,40 @@ static bool match_dlog_fds(struct dirent *direntp)
         else if (linklen >= PATH_MAX)
                 return log_error(false, "The name of link path is too long - \"%s\"", path);
  
-       if (strcmp(link, "/dev/log_main")       == 0 ||
-           strcmp(link, "/dev/log_system")     == 0 ||
-           strcmp(link, "/dev/log_radio")      == 0)
+       if (strequal(link, "/dev/log_main") ||
+           strequal(link, "/dev/log_system") ||
+           strequal(link, "/dev/log_radio"))
                 return true;
  
         return false;
  }
  #endif
  
+/*
+ * Parses the LISTEN_FDS environment variable and returns the highest fd
+ * number that must stay open for the container process, or 0 when the
+ * variable is unset, cannot be parsed, or announces no file descriptors.
+ *
+ * systemd hands over activated sockets starting at fd 3, so LISTEN_FDS=2
+ * yields 4: fds 3 and 4 MUST be passed on (in addition to the standard
+ * streams) to support [socket activation][systemd-listen-fds].
+ */
+static unsigned int get_listen_fds_max(void)
+{
+       unsigned int num_fds;
+       const char *val;
+
+       val = getenv("LISTEN_FDS");
+       if (!val)
+               return 0;
+
+       if (lxc_safe_uint(val, &num_fds) < 0)
+               return syserror_ret(0, "Failed to parse \"LISTEN_FDS=%s\" environment variable", val);
+
+       /* Passed fds are 3 .. num_fds + 2; a count of 0 passes none. */
+       return log_trace(num_fds ? num_fds + 2 : 0, "Parsed \"LISTEN_FDS=%s\" environment variable", val);
+}
+
  int lxc_check_inherited(struct lxc_conf *conf, bool closeall,
                         int *fds_to_ignore, size_t len_fds)
  {
@@ -209,10 +244,14 @@ int lxc_check_inherited(struct lxc_conf *conf, bool closeall,
         size_t i;
         DIR *dir;
         struct dirent *direntp;
+       unsigned int listen_fds_max;
+       struct lxc_state_client *client, *nclient;
  
         if (conf && conf->close_all_fds)
                 closeall = true;
  
+       listen_fds_max = get_listen_fds_max();
+
         /*
          * Disable syslog at this point to avoid the above logging
          * function to open a new fd and make the check_inherited function
@@ -229,13 +268,12 @@ restart:
  
         while ((direntp = readdir(dir))) {
                 int ret;
-               struct lxc_list *cur;
                 bool matched = false;
  
-               if (strcmp(direntp->d_name, ".") == 0)
+               if (strequal(direntp->d_name, "."))
                         continue;
  
-               if (strcmp(direntp->d_name, "..") == 0)
+               if (strequal(direntp->d_name, ".."))
                         continue;
  
                 ret = lxc_safe_int(direntp->d_name, &fd);
@@ -254,9 +292,7 @@ restart:
  
                 /* Keep state clients that wait on reboots. */
                 if (conf) {
-                       lxc_list_for_each(cur, &conf->state_clients) {
-                               struct lxc_state_client *client = cur->elem;
-
+                       list_for_each_entry_safe(client, nclient, &conf->state_clients, head) {
                                 if (client->clientfd != fd)
                                         continue;
  
@@ -279,6 +315,12 @@ restart:
                         continue;
  
  #endif
+
+               if ((size_t)fd <= listen_fds_max) {
+                       INFO("Inheriting fd %d (using the LISTEN_FDS environment variable)", fd);
+                       continue;
+               }
+
                 if (closeall) {
                         if (close(fd))
                                 SYSINFO("Closed inherited fd %d", fd);
@@ -312,7 +354,7 @@ static int setup_signal_fd(sigset_t *oldmask)
         if (ret < 0)
                 return -EBADF;
  
-       for (int sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
+       for (size_t sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
                 ret = sigdelset(&mask, signals[sig]);
                 if (ret < 0)
                         return -EBADF;
@@ -334,7 +376,7 @@ static int setup_signal_fd(sigset_t *oldmask)
  }
  
  static int signal_handler(int fd, uint32_t events, void *data,
-                         struct lxc_epoll_descr *descr)
+                         struct lxc_async_descr *descr)
  {
         int ret;
         siginfo_t info;
@@ -354,6 +396,9 @@ static int signal_handler(int fd, uint32_t events, void *data,
         if (ret == 0 && info.si_pid == hdlr->pid)
                 hdlr->init_died = true;
  
+       TRACE("Received signal ssi_signo(%d) for ssi_pid(%d), si_signo(%d), si_pid(%d)",
+             siginfo.ssi_signo, siginfo.ssi_pid, info.si_signo, info.si_pid);
+
         /* Try to figure out a reasonable exit status to report. */
         if (hdlr->init_died) {
                 switch (info.si_code) {
@@ -400,7 +445,7 @@ static int signal_handler(int fd, uint32_t events, void *data,
         /* More robustness, protect ourself from a SIGCHLD sent
          * by a process different from the container init.
          */
-       if (siginfo.ssi_pid != hdlr->pid) {
+       if ((__u64)siginfo.ssi_pid != (__u64)hdlr->pid) {
                 NOTICE("Received %d from pid %d instead of container init %d",
                        siginfo.ssi_signo, siginfo.ssi_pid, hdlr->pid);
                 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
@@ -425,10 +470,13 @@ static int signal_handler(int fd, uint32_t events, void *data,
  int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
                             lxc_state_t state)
  {
+       struct lxc_msg msg = {
+               .type   = lxc_msg_state,
+               .value  = state,
+       };
         size_t retlen;
         ssize_t ret;
-       struct lxc_list *cur, *next;
-       struct lxc_msg msg = {.type = lxc_msg_state, .value = state};
+       struct lxc_state_client *client, *nclient;
  
         if (state == THAWED)
                 handler->state = RUNNING;
@@ -437,16 +485,14 @@ int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
  
         TRACE("Set container state to %s", lxc_state2str(state));
  
-       if (lxc_list_empty(&handler->conf->state_clients))
+       if (list_empty(&handler->conf->state_clients))
                 return log_trace(0, "No state clients registered");
  
         retlen = strlcpy(msg.name, name, sizeof(msg.name));
         if (retlen >= sizeof(msg.name))
                 return -E2BIG;
  
-       lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
-               struct lxc_state_client *client = cur->elem;
-
+       list_for_each_entry_safe(client, nclient, &handler->conf->state_clients, head) {
                 if (client->states[state] == 0) {
                         TRACE("State %s not registered for state client %d",
                               lxc_state2str(state), client->clientfd);
@@ -461,10 +507,9 @@ int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
                         SYSERROR("Failed to send message to client");
  
                 /* kick client from list */
-               lxc_list_del(cur);
+               list_del(&client->head);
                 close(client->clientfd);
-               free(cur->elem);
-               free(cur);
+               free(client);
         }
  
         return 0;
@@ -532,12 +577,11 @@ int lxc_set_state(const char *name, struct lxc_handler *handler,
  int lxc_poll(const char *name, struct lxc_handler *handler)
  {
         int ret;
-       bool has_console = true;
-       struct lxc_epoll_descr descr, descr_console;
+       struct lxc_terminal *console = &handler->conf->console;
+       struct lxc_async_descr descr, descr_console;
  
-       if (handler->conf->console.path &&
-           strcmp(handler->conf->console.path, "none") == 0)
-               has_console = false;
+       if (!wants_console(console))
+               console = NULL;
  
         ret = lxc_mainloop_open(&descr);
         if (ret < 0) {
@@ -545,7 +589,7 @@ int lxc_poll(const char *name, struct lxc_handler *handler)
                 goto out_sigfd;
         }
  
-       if (has_console) {
+       if (console) {
                 ret = lxc_mainloop_open(&descr_console);
                 if (ret < 0) {
                         ERROR("Failed to create console mainloop");
@@ -553,7 +597,10 @@ int lxc_poll(const char *name, struct lxc_handler *handler)
                 }
         }
  
-       ret = lxc_mainloop_add_handler(&descr, handler->sigfd, signal_handler, handler);
+       ret = lxc_mainloop_add_handler(&descr, handler->sigfd,
+                                      signal_handler,
+                                      default_cleanup_handler,
+                                      handler, "signal_handler");
         if (ret < 0) {
                 ERROR("Failed to add signal handler for %d to mainloop", handler->sigfd);
                 goto out_mainloop_console;
@@ -565,22 +612,12 @@ int lxc_poll(const char *name, struct lxc_handler *handler)
                 goto out_mainloop_console;
         }
  
-       if (has_console) {
-               struct lxc_terminal *console = &handler->conf->console;
-
+       if (console) {
                 ret = lxc_terminal_mainloop_add(&descr, console);
                 if (ret < 0) {
                         ERROR("Failed to add console handlers to mainloop");
                         goto out_mainloop_console;
                 }
-
-               ret = lxc_terminal_mainloop_add(&descr_console, console);
-               if (ret < 0) {
-                       ERROR("Failed to add console handlers to console mainloop");
-                       goto out_mainloop_console;
-               }
-
-               handler->conf->console.descr = &descr;
         }
  
         ret = lxc_cmd_mainloop_add(name, &descr, handler);
@@ -592,15 +629,19 @@ int lxc_poll(const char *name, struct lxc_handler *handler)
         TRACE("Mainloop is ready");
  
         ret = lxc_mainloop(&descr, -1);
-       close_prot_errno_disarm(descr.epfd);
+       if (descr.type == LXC_MAINLOOP_EPOLL)
+               close_prot_errno_disarm(descr.epfd);
         if (ret < 0 || !handler->init_died)
                 goto out_mainloop_console;
  
-       if (has_console)
-               ret = lxc_mainloop(&descr_console, 0);
+       if (console) {
+               ret = lxc_terminal_mainloop_add(&descr_console, console);
+               if (ret == 0)
+                       ret = lxc_mainloop(&descr_console, 0);
+       }
  
  out_mainloop_console:
-       if (has_console) {
+       if (console) {
                 lxc_mainloop_close(&descr_console);
                 TRACE("Closed console mainloop");
         }
@@ -664,10 +705,20 @@ struct lxc_handler *lxc_init_handler(struct lxc_handler *old,
         handler->state_socket_pair[0] = -EBADF;
         handler->state_socket_pair[1] = -EBADF;
         if (handler->conf->reboot == REBOOT_NONE)
-               lxc_list_init(&handler->conf->state_clients);
+               INIT_LIST_HEAD(&handler->conf->state_clients);
+
+       for (lxc_namespace_t idx = 0; idx < LXC_NS_MAX; idx++) {
+               handler->nsfd[idx] = -EBADF;
+
+               if (handler->conf->reboot == REBOOT_NONE)
+                       continue;
  
-       for (int i = 0; i < LXC_NS_MAX; i++)
-               handler->nsfd[i] = -EBADF;
+               handler->nsfd_paths[idx][0] = '\0';
+               handler->hook_argv[idx] = NULL;
+
+               if (handler->hook_argc != 0)
+                       handler->hook_argc = 0;
+       }
  
         handler->name = name;
         if (daemonize)
@@ -695,7 +746,7 @@ struct lxc_handler *lxc_init_handler(struct lxc_handler *old,
         }
  
         if (handler->conf->reboot == REBOOT_NONE) {
-               handler->conf->maincmd_fd = lxc_cmd_init(name, lxcpath, "command");
+               handler->conf->maincmd_fd = lxc_server_init(name, lxcpath, "command");
                 if (handler->conf->maincmd_fd < 0) {
                         ERROR("Failed to set up command socket");
                         goto on_error;
@@ -796,6 +847,10 @@ int lxc_init(const char *name, struct lxc_handler *handler)
                 return log_error(-1, "Failed to run lxc.hook.pre-start for container \"%s\"", name);
         TRACE("Ran pre-start hooks");
  
+       ret = lxc_terminal_parent(conf);
+       if (ret < 0)
+               return log_error(-1, "Failed to allocate terminal");
+
         /* The signal fd has to be created before forking otherwise if the child
          * process exits before we setup the signal fd, the event will be lost
          * and the command will be stuck.
@@ -805,30 +860,24 @@ int lxc_init(const char *name, struct lxc_handler *handler)
                 return log_error(-1, "Failed to setup SIGCHLD fd handler.");
         TRACE("Set up signal fd");
  
-       /* Do this after setting up signals since it might unblock SIGWINCH. */
-       ret = lxc_terminal_setup(conf);
-       if (ret < 0) {
-               ERROR("Failed to create console");
-               goto out_restore_sigmask;
-       }
-       TRACE("Created console");
-
         handler->cgroup_ops = cgroup_init(handler->conf);
         if (!handler->cgroup_ops) {
                 ERROR("Failed to initialize cgroup driver");
-               goto out_delete_terminal;
+               goto out_restore_sigmask;
         }
         TRACE("Initialized cgroup driver");
  
         ret = lxc_read_seccomp_config(conf);
-       if (ret < 0)
-               return log_error(-1, "Failed loading seccomp policy");
+       if (ret < 0) {
+               ERROR("Failed to read seccomp policy");
+               goto out_restore_sigmask;
+       }
         TRACE("Read seccomp policy");
  
         ret = handler->lsm_ops->prepare(handler->lsm_ops, conf, handler->lxcpath);
         if (ret < 0) {
                 ERROR("Failed to initialize LSM");
-               goto out_delete_terminal;
+               goto out_restore_sigmask;
         }
         TRACE("Initialized LSM");
  
@@ -836,63 +885,47 @@ int lxc_init(const char *name, struct lxc_handler *handler)
         handler->monitor_status_fd = move_fd(status_fd);
         return 0;
  
-out_delete_terminal:
-       lxc_terminal_delete(&handler->conf->console);
-
  out_restore_sigmask:
         (void)pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
  
         return -1;
  }
  
+/* Export the stashed path of every preserved namespace fd to the environment. */
+void lxc_expose_namespace_environment(const struct lxc_handler *handler)
+{
+       for (lxc_namespace_t idx = 0; idx < LXC_NS_MAX; idx++) {
+               const char *env_name = ns_info[idx].env_name;
+               const char *path = handler->nsfd_paths[idx];
+
+               if (handler->nsfd[idx] < 0)
+                       continue;
+
+               /* Drop everything before the first '/' of the stashed entry. */
+               path += strcspn(path, "/");
+               if (setenv(env_name, path, 1) < 0) {
+                       SYSERROR("Failed to set environment variable %s=%s", env_name, path);
+                       continue;
+               }
+               TRACE("Set environment variable %s=%s", env_name, path);
+       }
+}
+
  void lxc_end(struct lxc_handler *handler)
  {
         int ret;
-       pid_t self;
-       struct lxc_list *cur, *next;
-       char *namespaces[LXC_NS_MAX + 1];
-       size_t namespace_count = 0;
         const char *name = handler->name;
         struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
+       struct lxc_state_client *client, *nclient;
  
         /* The STOPPING state is there for future cleanup code which can take
          * awhile.
          */
         lxc_set_state(name, handler, STOPPING);
  
-       self = lxc_raw_getpid();
-       for (int i = 0; i < LXC_NS_MAX; i++) {
-               if (handler->nsfd[i] < 0)
-                       continue;
-
-               if (handler->conf->hooks_version == 0)
-                       ret = asprintf(&namespaces[namespace_count],
-                                     "%s:/proc/%d/fd/%d", ns_info[i].proc_name,
-                                     self, handler->nsfd[i]);
-               else
-                       ret = asprintf(&namespaces[namespace_count],
-                                     "/proc/%d/fd/%d", self, handler->nsfd[i]);
-               if (ret < 0) {
-                       SYSERROR("Failed to allocate memory");
-                       break;
-               }
-
-               if (handler->conf->hooks_version == 0) {
-                       namespace_count++;
-                       continue;
-               }
-
-               ret = setenv(ns_info[i].env_name, namespaces[namespace_count], 1);
-               if (ret < 0)
-                       SYSERROR("Failed to set environment variable %s=%s",
-                                ns_info[i].env_name, namespaces[namespace_count]);
-               else
-                       TRACE("Set environment variable %s=%s",
-                             ns_info[i].env_name, namespaces[namespace_count]);
-
-               namespace_count++;
-       }
-       namespaces[namespace_count] = NULL;
+       /* Passing information to hooks via environment variables. */
+       if (handler->conf->hooks_version > 0)
+               lxc_expose_namespace_environment(handler);
  
         if (handler->conf->reboot > REBOOT_NONE) {
                 ret = setenv("LXC_TARGET", "reboot", 1);
@@ -907,15 +940,12 @@ void lxc_end(struct lxc_handler *handler)
         }
  
         if (handler->conf->hooks_version == 0)
-               ret = run_lxc_hooks(name, "stop", handler->conf, namespaces);
+               ret = run_lxc_hooks(name, "stop", handler->conf, handler->hook_argv);
         else
                 ret = run_lxc_hooks(name, "stop", handler->conf, NULL);
         if (ret < 0)
                 ERROR("Failed to run \"lxc.hook.stop\" hook");
  
-       while (namespace_count--)
-               free(namespaces[namespace_count]);
-
         handler->lsm_ops->cleanup(handler->lsm_ops, handler->conf, handler->lxcpath);
  
         if (cgroup_ops) {
@@ -978,19 +1008,16 @@ void lxc_end(struct lxc_handler *handler)
         /* The command socket is now closed, no more state clients can register
          * themselves from now on. So free the list of state clients.
          */
-       lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
-               struct lxc_state_client *client = cur->elem;
-
+       list_for_each_entry_safe(client, nclient, &handler->conf->state_clients, head) {
                 /* Keep state clients that want to be notified about reboots. */
                 if ((handler->conf->reboot > REBOOT_NONE) &&
                     (client->states[RUNNING] == 2))
                         continue;
  
                 /* close state client socket */
-               lxc_list_del(cur);
+               list_del(&client->head);
                 close(client->clientfd);
-               free(cur->elem);
-               free(cur);
+               free(client);
         }
  
         if (handler->conf->ephemeral == 1 && handler->conf->reboot != REBOOT_REQ)
@@ -1026,19 +1053,18 @@ static int do_start(void *data)
  {
         struct lxc_handler *handler = data;
         __lxc_unused __do_close int data_sock0 = handler->data_sock[0],
-                                          data_sock1 = handler->data_sock[1];
+                                   data_sock1 = handler->data_sock[1];
         __do_close int devnull_fd = -EBADF, status_fd = -EBADF;
         int ret;
         uid_t new_uid;
         gid_t new_gid;
-       struct lxc_list *iterator;
         uid_t nsuid = 0;
         gid_t nsgid = 0;
  
         lxc_sync_fini_parent(handler);
  
-       if (lxc_abstract_unix_recv_fds(data_sock1, &status_fd, 1, NULL, 0) < 0) {
-               ERROR("Failed to receive status file descriptor to child process");
+       if (lxc_abstract_unix_recv_one_fd(data_sock1, &status_fd, NULL, 0) < 0) {
+               ERROR("Failed to receive status file descriptor from parent process");
                 goto out_warn_father;
         }
  
@@ -1081,24 +1107,10 @@ static int do_start(void *data)
                 INFO("Unshared CLONE_NEWNET");
         }
  
-       /* Tell the parent task it can begin to configure the container and wait
-        * for it to finish.
-        */
-       if (!lxc_sync_barrier_parent(handler, START_SYNC_CONFIGURE))
-               goto out_error;
-
-       if (handler->ns_clone_flags & CLONE_NEWNET) {
-               ret = lxc_network_recv_from_parent(handler);
-               if (ret < 0) {
-                       ERROR("Failed to receive veth names from parent");
-                       goto out_warn_father;
-               }
-       }
-
         /* If we are in a new user namespace, become root there to have
          * privilege over our namespace.
          */
-       if (!lxc_list_empty(&handler->conf->id_map)) {
+       if (!list_empty(&handler->conf->id_map)) {
                 if (!handler->conf->root_nsuid_map)
                         nsuid = handler->conf->init_uid;
  
@@ -1147,9 +1159,9 @@ static int do_start(void *data)
         if (handler->daemonize && !handler->conf->autodev) {
                 char path[PATH_MAX];
  
-               ret = snprintf(path, sizeof(path), "%s/dev/null",
-                              handler->conf->rootfs.mount);
-               if (ret < 0 || ret >= sizeof(path))
+               ret = strnprintf(path, sizeof(path), "%s/dev/null",
+                                handler->conf->rootfs.mount);
+               if (ret < 0)
                         goto out_warn_father;
  
                 ret = access(path, F_OK);
@@ -1162,8 +1174,11 @@ static int do_start(void *data)
                 }
         }
  
-       /* Ask father to setup cgroups and wait for him to finish. */
-       if (!lxc_sync_barrier_parent(handler, START_SYNC_CGROUP))
+       /*
+        * Tell the parent task it can begin to configure the container and wait
+        * for it to finish.
+        */
+       if (!lxc_sync_wake_parent(handler, START_SYNC_CONFIGURE))
                 goto out_error;
  
         /* Unshare cgroup namespace after we have setup our cgroups. If we do it
@@ -1235,25 +1250,24 @@ static int do_start(void *data)
                         }
  
                         ret = setns(timens_fd, CLONE_NEWTIME);
-                       if (ret) {
+                       if (ret) {
                                 SYSERROR("Failed to setns(%d(\"/proc/self/ns/time_for_children\"))", timens_fd);
                                 goto out_warn_father;
                         }
                 }
         }
  
-       /* Add the requested environment variables to the current environment to
-        * allow them to be used by the various hooks, such as the start hook
-        * below.
+       /*
+        * Add the requested environment variables to the current environment
+        * to allow them to be used by the various hooks, such as the start
+        * hook below.
          */
-       lxc_list_for_each(iterator, &handler->conf->environment) {
-               ret = putenv((char *)iterator->elem);
-               if (ret < 0) {
-                       SYSERROR("Failed to set environment variable: %s",
-                                (char *)iterator->elem);
-                       goto out_warn_father;
-               }
-       }
+       ret = lxc_set_environment(handler->conf);
+       if (ret < 0)
+               goto out_warn_father;
+
+       if (!lxc_sync_wait_parent(handler, START_SYNC_POST_CONFIGURE))
+               goto out_warn_father;
  
         /* Setup the container, ip, names, utsname, ... */
         ret = lxc_setup(handler);
@@ -1280,24 +1294,6 @@ static int do_start(void *data)
                 DEBUG("Set PR_SET_NO_NEW_PRIVS to block execve() gainable privileges");
         }
  
-       /* Some init's such as busybox will set sane tty settings on stdin,
-        * stdout, stderr which it thinks is the console. We already set them
-        * the way we wanted on the real terminal, and we want init to do its
-        * setup on its console ie. the pty allocated in lxc_terminal_setup() so
-        * make sure that that pty is stdin,stdout,stderr.
-        */
-        if (handler->conf->console.pty >= 0) {
-                if (handler->daemonize || !handler->conf->is_execute)
-                        ret = set_stdfds(handler->conf->console.pty);
-                else
-                        ret = lxc_terminal_set_stdfds(handler->conf->console.pty);
-                if (ret < 0) {
-                       ERROR("Failed to redirect std{in,out,err} to pty file descriptor %d",
-                             handler->conf->console.pty);
-                       goto out_warn_father;
-                }
-        }
-
         /* If we mounted a temporary proc, then unmount it now. */
         tmp_proc_unmount(handler->conf);
  
@@ -1305,12 +1301,6 @@ static int do_start(void *data)
         if (ret < 0)
                 goto out_warn_father;
  
-       ret = lxc_seccomp_send_notifier_fd(&handler->conf->seccomp, data_sock0);
-       if (ret < 0) {
-               SYSERROR("Failed to send seccomp notify fd to parent");
-               goto out_warn_father;
-       }
-
         ret = run_lxc_hooks(handler->name, "start", handler->conf, NULL);
         if (ret < 0) {
                 ERROR("Failed to run lxc.hook.start for container \"%s\"",
@@ -1350,6 +1340,15 @@ static int do_start(void *data)
         if (!lxc_sync_barrier_parent(handler, START_SYNC_CGROUP_LIMITS))
                 goto out_warn_father;
  
+       ret = lxc_sync_fds_child(handler);
+       if (ret < 0) {
+               SYSERROR("Failed to sync file descriptors with parent");
+               goto out_warn_father;
+       }
+
+       if (!lxc_sync_wait_parent(handler, START_SYNC_READY_START))
+               goto out_warn_father;
+
         /* Reset the environment variables the user requested in a clear
          * environment.
          */
@@ -1358,14 +1357,9 @@ static int do_start(void *data)
         if (ret < 0)
                 SYSERROR("Failed to clear environment.");
  
-       lxc_list_for_each(iterator, &handler->conf->environment) {
-               ret = putenv((char *)iterator->elem);
-               if (ret < 0) {
-                       SYSERROR("Failed to set environment variable: %s",
-                                (char *)iterator->elem);
-                       goto out_warn_father;
-               }
-       }
+       ret = lxc_set_environment(handler->conf);
+       if (ret < 0)
+               goto out_warn_father;
  
         ret = putenv("container=lxc");
         if (ret < 0) {
@@ -1403,7 +1397,7 @@ static int do_start(void *data)
          * we switched to root in the new user namespace further above. Only
          * drop groups if we can, so ensure that we have necessary privilege.
          */
-       if (lxc_list_empty(&handler->conf->id_map)) {
+       if (list_empty(&handler->conf->id_map)) {
                 #if HAVE_LIBCAP
                 if (lxc_proc_cap_is_set(CAP_SETGID, CAP_EFFECTIVE))
                 #endif
@@ -1455,44 +1449,6 @@ out_error:
         return -1;
  }
  
-static int lxc_recv_ttys_from_child(struct lxc_handler *handler)
-{
-       int i;
-       struct lxc_terminal_info *tty;
-       int ret = -1;
-       int sock = handler->data_sock[1];
-       struct lxc_conf *conf = handler->conf;
-       struct lxc_tty_info *ttys = &conf->ttys;
-
-       if (!conf->ttys.max)
-               return 0;
-
-       ttys->tty = malloc(sizeof(*ttys->tty) * ttys->max);
-       if (!ttys->tty)
-               return -1;
-
-       for (i = 0; i < conf->ttys.max; i++) {
-               int ttyfds[2];
-
-               ret = lxc_abstract_unix_recv_fds(sock, ttyfds, 2, NULL, 0);
-               if (ret < 0)
-                       break;
-
-               tty = &ttys->tty[i];
-               tty->busy = -1;
-               tty->ptx = ttyfds[0];
-               tty->pty = ttyfds[1];
-               TRACE("Received pty with ptx fd %d and pty fd %d from child", tty->ptx, tty->pty);
-       }
-
-       if (ret < 0)
-               SYSERROR("Failed to receive %zu ttys from child", ttys->max);
-       else
-               TRACE("Received %zu ttys from child", ttys->max);
-
-       return ret;
-}
-
  int resolve_clone_flags(struct lxc_handler *handler)
  {
         int i;
@@ -1508,7 +1464,7 @@ int resolve_clone_flags(struct lxc_handler *handler)
                         if ((conf->ns_clone & ns_info[i].clone_flag))
                                 handler->ns_clone_flags |= ns_info[i].clone_flag;
                 } else {
-                       if (i == LXC_NS_USER && lxc_list_empty(&handler->conf->id_map))
+                       if (i == LXC_NS_USER && list_empty(&handler->conf->id_map))
                                 continue;
  
                         if (i == LXC_NS_NET && lxc_requests_empty_network(handler))
@@ -1560,9 +1516,9 @@ int resolve_clone_flags(struct lxc_handler *handler)
   * newer glibc versions where the getpid() cache is removed and the pid/tid is
   * not reset anymore.
   * However, if for whatever reason you - dear committer - somehow need to get the
- * pid of the dummy intermediate process for do_share_ns() you need to call
- * lxc_raw_getpid(). The next lxc_raw_clone() call does not employ CLONE_VM and
- * will be fine.
+ * pid of the placeholder intermediate process for do_share_ns() you need to
+ * call lxc_raw_getpid(). The next lxc_raw_clone() call does not employ
+ * CLONE_VM and will be fine.
   */
  static inline int do_share_ns(void *arg)
  {
@@ -1598,6 +1554,40 @@ static inline int do_share_ns(void *arg)
         return 0;
  }
  
+static int core_scheduling(struct lxc_handler *handler) /* caller (lxc_spawn) treats a return < 0 as fatal */
+{
+       struct lxc_conf *conf = handler->conf;
+       int ret;
+
+       if (!conf->sched_core) /* nothing to do unless conf->sched_core is set */
+               return log_trace(0, "No new core scheduling domain requested");
+
+       if (!(handler->ns_clone_flags & CLONE_NEWPID)) /* refuse unless a new pid namespace is cloned */
+               return syserror_set(-EINVAL, "Core scheduling currently requires a separate pid namespace");
+
+       ret = core_scheduling_cookie_create_threadgroup(handler->pid);
+       if (ret < 0) {
+               if (ret == -ENODEV) { /* non-fatal: turn the feature off and report success */
+                       INFO("The kernel doesn't support or doesn't use simultaneous multithreading (SMT)");
+                       conf->sched_core = false;
+                       return 0;
+               }
+               if (ret == -EINVAL)
+                       return syserror("The kernel does not support core scheduling");
+
+               return syserror("Failed to create new core scheduling domain");
+       }
+
+       ret = core_scheduling_cookie_get(handler->pid, &conf->sched_core_cookie); /* cookie is stored in conf */
+       if (ret || !core_scheduling_cookie_valid(conf->sched_core_cookie)) /* lookup error or invalid cookie */
+               return syserror("Failed to retrieve core scheduling domain cookie");
+
+       TRACE("Created new core scheduling domain with cookie %llu",
+             (llu)conf->sched_core_cookie);
+
+       return 0;
+}
+
  /* lxc_spawn() performs crucial setup tasks and clone()s the new process which
   * exec()s the requested container binary.
   * Note that lxc_spawn() runs in the parent namespaces. Any operations performed
@@ -1611,7 +1601,7 @@ static int lxc_spawn(struct lxc_handler *handler)
         int i, ret;
         char pidstr[20];
         bool wants_to_map_ids;
-       struct lxc_list *id_map;
+       struct list_head *id_map;
         const char *name = handler->name;
         const char *lxcpath = handler->lxcpath;
         bool share_ns = false;
@@ -1619,7 +1609,7 @@ static int lxc_spawn(struct lxc_handler *handler)
         struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
  
         id_map = &conf->id_map;
-       wants_to_map_ids = !lxc_list_empty(id_map);
+       wants_to_map_ids = !list_empty(id_map);
  
         for (i = 0; i < LXC_NS_MAX; i++) {
                 if (!conf->ns_share[i])
@@ -1659,16 +1649,6 @@ static int lxc_spawn(struct lxc_handler *handler)
                 goto out_delete_net;
         }
  
-       /* If the rootfs is not a blockdev, prevent the container from marking
-        * it readonly.
-        * If the container is unprivileged then skip rootfs pinning.
-        */
-       ret = lxc_rootfs_prepare(&conf->rootfs, wants_to_map_ids);
-       if (ret) {
-               ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
-               goto out_delete_net;
-       }
-
         /* Create a process in a new set of namespaces. */
         if (share_ns) {
                 pid_t attacher_pid;
@@ -1728,7 +1708,26 @@ static int lxc_spawn(struct lxc_handler *handler)
                 /* Kernel might be too old for clone3(). */
                 if (handler->pid < 0) {
                         SYSTRACE("Failed to spawn container via clone3()");
+
+               /*
+                * In contrast to all other architectures arm64 verifies that
+                * the argument we use to retrieve the pidfd with is
+                * initialized to 0. But we need to be able to initialize it to
+                * a negative value such as our customary -EBADF so we can
+                * detect whether this kernel supports pidfds. If the syscall
+                * returns and the pidfd variable is set to something >= 0 then
+                * we know this is a kernel supporting pidfds. But if we can't
+                * set it to -EBADF then this won't work since 0 is a valid
+                * file descriptor too. And since legacy clone silently ignores
+                * unknown flags we are left without any way to detect support
+                * for pidfds. So let's special-case arm64 to not fail starting
+                * containers.
+                */
+               #if defined(__aarch64__)
+                       handler->pid = lxc_raw_legacy_clone(handler->clone_flags & ~CLONE_PIDFD, NULL);
+               #else
                         handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd);
+               #endif
                 }
  
                 if (handler->pid < 0) {
@@ -1745,12 +1744,16 @@ static int lxc_spawn(struct lxc_handler *handler)
                 handler->clone_flags &= ~CLONE_PIDFD;
         TRACE("Cloned child process %d", handler->pid);
  
+       ret = core_scheduling(handler);
+       if (ret < 0)
+               goto out_delete_net;
+
         /* Verify that we can actually make use of pidfds. */
         if (!lxc_can_use_pidfd(handler->pidfd))
                 close_prot_errno_disarm(handler->pidfd);
  
-       ret = snprintf(pidstr, 20, "%d", handler->pid);
-       if (ret < 0 || ret >= 20)
+       ret = strnprintf(pidstr, 20, "%d", handler->pid);
+       if (ret < 0)
                 goto out_delete_net;
  
         ret = setenv("LXC_PID", pidstr, 1);
@@ -1761,7 +1764,7 @@ static int lxc_spawn(struct lxc_handler *handler)
                 if (handler->ns_on_clone_flags & ns_info[i].clone_flag)
                         INFO("Cloned %s", ns_info[i].flag_name);
  
-       if (!lxc_try_preserve_namespaces(handler, handler->ns_on_clone_flags, handler->pid)) {
+       if (!lxc_try_preserve_namespaces(handler, handler->ns_on_clone_flags)) {
                 ERROR("Failed to preserve cloned namespaces for lxc.hook.stop");
                 goto out_delete_net;
         }
@@ -1791,24 +1794,18 @@ static int lxc_spawn(struct lxc_handler *handler)
                 }
         }
  
-       if (!lxc_sync_wake_child(handler, START_SYNC_STARTUP))
-               goto out_delete_net;
-
-       if (!lxc_sync_wait_child(handler, START_SYNC_CONFIGURE))
-               goto out_delete_net;
-
         if (!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, false)) {
                 ERROR("Failed to setup cgroup limits for container \"%s\"", name);
                 goto out_delete_net;
         }
  
-       if (!cgroup_ops->payload_enter(cgroup_ops, handler)) {
-               ERROR("Failed to enter cgroups");
+       if (!cgroup_ops->payload_delegate_controllers(cgroup_ops)) {
+               ERROR("Failed to delegate controllers to payload cgroup");
                 goto out_delete_net;
         }
  
-       if (!cgroup_ops->payload_delegate_controllers(cgroup_ops)) {
-               ERROR("Failed to delegate controllers to payload cgroup");
+       if (!cgroup_ops->payload_enter(cgroup_ops, handler)) {
+               ERROR("Failed to enter cgroups");
                 goto out_delete_net;
         }
  
@@ -1820,17 +1817,17 @@ static int lxc_spawn(struct lxc_handler *handler)
         if (!cgroup_ops->chown(cgroup_ops, handler->conf))
                 goto out_delete_net;
  
+       if (!lxc_sync_barrier_child(handler, START_SYNC_STARTUP))
+               goto out_delete_net;
+
         /* If not done yet, we're now ready to preserve the network namespace */
         if (handler->nsfd[LXC_NS_NET] < 0) {
-               ret = lxc_try_preserve_ns(handler->pid, "net");
+               ret = lxc_try_preserve_namespace(handler, LXC_NS_NET, "net");
                 if (ret < 0) {
-                       if (ret != -EOPNOTSUPP) {
+                       if (ret != -ENOENT) {
                                 SYSERROR("Failed to preserve net namespace");
                                 goto out_delete_net;
                         }
-               } else {
-                       handler->nsfd[LXC_NS_NET] = ret;
-                       DEBUG("Preserved net namespace via fd %d", ret);
                 }
         }
         ret = lxc_netns_set_nsid(handler->nsfd[LXC_NS_NET]);
@@ -1846,40 +1843,53 @@ static int lxc_spawn(struct lxc_handler *handler)
                         ERROR("Failed to create the network");
                         goto out_delete_net;
                 }
+       }
  
-               ret = lxc_network_send_to_child(handler);
-               if (ret < 0) {
-                       ERROR("Failed to send veth names to child");
-                       goto out_delete_net;
-               }
+       ret = setup_proc_filesystem(conf, handler->pid);
+       if (ret < 0) {
+               ERROR("Failed to setup procfs limits");
+               goto out_delete_net;
         }
  
-       if (!lxc_list_empty(&conf->procs)) {
-               ret = setup_proc_filesystem(&conf->procs, handler->pid);
-               if (ret < 0)
-                       goto out_delete_net;
+       ret = setup_resource_limits(conf, handler->pid);
+       if (ret < 0) {
+               ERROR("Failed to setup resource limits");
+               goto out_delete_net;
         }
  
-       /* Tell the child to continue its initialization. We'll get
-        * START_SYNC_CGROUP when it is ready for us to setup cgroups.
-        */
-       if (!lxc_sync_barrier_child(handler, START_SYNC_POST_CONFIGURE))
+       /* Tell the child to continue its initialization. */
+       if (!lxc_sync_wake_child(handler, START_SYNC_POST_CONFIGURE))
                 goto out_delete_net;
  
-       if (!lxc_list_empty(&conf->limits)) {
-               ret = setup_resource_limits(&conf->limits, handler->pid);
+       ret = lxc_rootfs_prepare_parent(handler);
+       if (ret) {
+               ERROR("Failed to prepare rootfs");
+               goto out_delete_net;
+       }
+
+       if (handler->ns_clone_flags & CLONE_NEWNET) {
+               ret = lxc_network_send_to_child(handler);
                 if (ret < 0) {
-                       ERROR("Failed to setup resource limits");
+                       SYSERROR("Failed to send veth names to child");
                         goto out_delete_net;
                 }
         }
  
-       if (!lxc_sync_barrier_child(handler, START_SYNC_CGROUP_UNSHARE))
+       if (!lxc_sync_wait_child(handler, START_SYNC_IDMAPPED_MOUNTS))
+               goto out_delete_net;
+
+       ret = lxc_idmapped_mounts_parent(handler);
+       if (ret) {
+               ERROR("Failed to setup mount entries");
+               goto out_delete_net;
+       }
+
+       if (!lxc_sync_wait_child(handler, START_SYNC_CGROUP_LIMITS))
                 goto out_delete_net;
  
         /*
-        * with isolation the limiting devices cgroup was already setup, so
-        * only setup devices here if we have no namespace directory
+        * With isolation the limiting devices cgroup was already setup, so
+        * only setup devices here if we have no namespace directory.
          */
         if (!handler->conf->cgroup_meta.namespace_dir &&
             !cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, true)) {
@@ -1894,66 +1904,62 @@ static int lxc_spawn(struct lxc_handler *handler)
         }
         TRACE("Set up cgroup2 device controller limits");
  
+       cgroup_ops->finalize(cgroup_ops);
+       TRACE("Finished setting up cgroups");
+
+       /* Run any host-side start hooks */
+       ret = run_lxc_hooks(name, "start-host", conf, NULL);
+       if (ret < 0) {
+               ERROR("Failed to run lxc.hook.start-host");
+               goto out_delete_net;
+       }
+
+       if (!lxc_sync_wake_child(handler, START_SYNC_FDS))
+               goto out_delete_net;
+
         if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
                 /* Now we're ready to preserve the cgroup namespace */
-               ret = lxc_try_preserve_ns(handler->pid, "cgroup");
+               ret = lxc_try_preserve_namespace(handler, LXC_NS_CGROUP, "cgroup");
                 if (ret < 0) {
-                       if (ret != -EOPNOTSUPP) {
+                       if (ret != -ENOENT) {
                                 SYSERROR("Failed to preserve cgroup namespace");
                                 goto out_delete_net;
                         }
-               } else {
-                       handler->nsfd[LXC_NS_CGROUP] = ret;
-                       DEBUG("Preserved cgroup namespace via fd %d", ret);
                 }
         }
  
-       cgroup_ops->payload_finalize(cgroup_ops);
-       TRACE("Finished setting up cgroups");
-
         if (handler->ns_unshare_flags & CLONE_NEWTIME) {
-               /* Now we're ready to preserve the cgroup namespace */
-               ret = lxc_try_preserve_ns(handler->pid, "time");
+               /* Now we're ready to preserve the time namespace */
+               ret = lxc_try_preserve_namespace(handler, LXC_NS_TIME, "time");
                 if (ret < 0) {
-                       if (ret != -EOPNOTSUPP) {
+                       if (ret != -ENOENT) {
                                 SYSERROR("Failed to preserve time namespace");
                                 goto out_delete_net;
                         }
-               } else {
-                       handler->nsfd[LXC_NS_TIME] = ret;
-                       DEBUG("Preserved time namespace via fd %d", ret);
                 }
         }
  
-       /* Run any host-side start hooks */
-       ret = run_lxc_hooks(name, "start-host", conf, NULL);
+       ret = lxc_sync_fds_parent(handler);
         if (ret < 0) {
-               ERROR("Failed to run lxc.hook.start-host");
+               SYSERROR("Failed to sync file descriptors with child");
                 goto out_delete_net;
         }
  
-       /* Tell the child to complete its initialization and wait for it to exec
-        * or return an error. (The child will never return
-        * START_SYNC_READY_START+1. It will either close the sync pipe, causing
-        * lxc_sync_barrier_child to return success, or return a different
-        * value, causing us to error out).
-        */
-       if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START))
+       ret = lxc_terminal_setup(conf);
+       if (ret < 0) {
+               SYSERROR("Failed to create console");
                 goto out_delete_net;
-
-       if (handler->ns_clone_flags & CLONE_NEWNET) {
-               ret = lxc_network_recv_name_and_ifindex_from_child(handler);
-               if (ret < 0) {
-                       ERROR("Failed to receive names and ifindices for network devices from child");
-                       goto out_delete_net;
-               }
         }
  
-       ret = lxc_setup_devpts_parent(handler);
-       if (ret < 0) {
-               SYSERROR("Failed to receive devpts fd from child");
+       /*
+        * Tell the child to complete its initialization and wait for it to
+        * exec or return an error. (The child will never return
+        * START_SYNC_READY_START+1. It will either close the sync pipe,
+        * causing lxc_sync_barrier_child to return success, or return a
+        * different value, causing us to error out).
+        */
+       if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START))
                 goto out_delete_net;
-       }
  
         /* Now all networks are created, network devices are moved into place,
          * and the correct names and ifindices in the respective namespaces have
@@ -1962,19 +1968,6 @@ static int lxc_spawn(struct lxc_handler *handler)
          */
         lxc_log_configured_netdevs(conf);
  
-       /* Read tty fds allocated by child. */
-       ret = lxc_recv_ttys_from_child(handler);
-       if (ret < 0) {
-               ERROR("Failed to receive tty info from child process");
-               goto out_delete_net;
-       }
-
-       ret = lxc_seccomp_recv_notifier_fd(&handler->conf->seccomp, data_sock1);
-       if (ret < 0) {
-               SYSERROR("Failed to receive seccomp notify fd from child");
-               goto out_delete_net;
-       }
-
         ret = handler->ops->post_start(handler, handler->data);
         if (ret < 0)
                 goto out_abort;
@@ -2032,20 +2025,33 @@ int __lxc_start(struct lxc_handler *handler, struct lxc_operations *ops,
                 goto out_abort;
         }
  
+       if (!cgroup_ops->monitor_delegate_controllers(cgroup_ops)) {
+               ERROR("Failed to delegate controllers to monitor cgroup");
+               ret = -1;
+               goto out_abort;
+       }
+
         if (!cgroup_ops->monitor_enter(cgroup_ops, handler)) {
                 ERROR("Failed to enter monitor cgroup");
                 ret = -1;
                 goto out_abort;
         }
  
-       if (!cgroup_ops->monitor_delegate_controllers(cgroup_ops)) {
-               ERROR("Failed to delegate controllers to monitor cgroup");
+       /* If the rootfs is not a blockdev, prevent the container from marking
+        * it readonly.
+        * If the container is unprivileged then skip rootfs pinning.
+        */
+       ret = lxc_rootfs_init(conf, !list_empty(&conf->id_map));
+       if (ret) {
+               ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
                 ret = -1;
                 goto out_abort;
         }
  
-       if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) {
-               /* If the backing store is a device, mount it here and now. */
+       if (geteuid() == 0 && !list_empty(&conf->id_map)) {
+               /*
+                * Most filesystems can't be mounted inside a userns so handle them here.
+                */
                 if (rootfs_is_blockdev(conf)) {
                         ret = unshare(CLONE_NEWNS);
                         if (ret < 0) {
@@ -2054,7 +2060,6 @@ int __lxc_start(struct lxc_handler *handler, struct lxc_operations *ops,
                         }
                         INFO("Unshared CLONE_NEWNS");
  
-                       turn_into_dependent_mounts();
                         ret = lxc_setup_rootfs_prepare_root(conf, name, lxcpath);
                         if (ret < 0) {
                                 ERROR("Error setting up rootfs mount as root before spawn");
@@ -2200,8 +2205,8 @@ static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
         }
         INFO("Destroyed rootfs for container \"%s\"", name);
  
-       ret = snprintf(destroy, PATH_MAX, "%s/%s", handler->lxcpath, name);
-       if (ret < 0 || ret >= PATH_MAX) {
+       ret = strnprintf(destroy, sizeof(destroy), "%s/%s", handler->lxcpath, name);
+       if (ret < 0) {
                 ERROR("Error destroying directory for container \"%s\"", name);
                 return;
         }