git.proxmox.com Git - mirror_lxc.git/blobdiff - src/lxc/start.c
Merge pull request #3069 from brauner/2019-07-01/network_creation
[mirror_lxc.git] / src / lxc / start.c
index 17806704259d8fee404dae8f13178475ca5e783a..9e28d3dcdfd65df3abf8607e2a563c4cf6e2d697 100644 (file)
@@ -26,7 +26,6 @@
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE 1
 #endif
-#include <alloca.h>
 #include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
@@ -67,6 +66,7 @@
 #include "lxcseccomp.h"
 #include "macro.h"
 #include "mainloop.h"
+#include "memory_utils.h"
 #include "monitor.h"
 #include "namespace.h"
 #include "network.h"
@@ -97,16 +97,13 @@ static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
 
 static void print_top_failing_dir(const char *path)
 {
+       __do_free char *copy = NULL;
        int ret;
-       size_t len;
-       char *copy, *e, *p, saved;
-
-       len = strlen(path);
-       copy = alloca(len + 1);
-       (void)strlcpy(copy, path, len + 1);
+       char *e, *p, saved;
 
+       copy = must_copy_string(path);
        p = copy;
-       e = copy + len;
+       e = copy + strlen(path);
 
        while (p < e) {
                while (p < e && *p == '/')
@@ -180,8 +177,6 @@ static bool lxc_try_preserve_namespaces(struct lxc_handler *handler,
 
                fd = lxc_try_preserve_ns(pid, ns_info[i].proc_name);
                if (fd < 0) {
-                       handler->nsfd[i] = -EBADF;
-
                        /* Do not fail to start container on kernels that do
                         * not support interacting with namespaces through
                         * /proc.
@@ -206,6 +201,38 @@ static inline bool match_stdfds(int fd)
        return (fd == STDIN_FILENO || fd == STDOUT_FILENO || fd == STDERR_FILENO);
 }
 
+#ifdef HAVE_DLOG
+static bool match_dlog_fds(struct dirent *direntp)
+{
+       char path[PATH_MAX] = {0};
+       char link[PATH_MAX] = {0};
+       ssize_t linklen;
+       int ret;
+
+       ret = snprintf(path, PATH_MAX, "/proc/self/fd/%s", direntp->d_name);
+       if (ret < 0 || ret >= PATH_MAX) {
+               ERROR("Failed to create file descriptor name");
+               return false;
+       }
+
+       linklen = readlink(path, link, PATH_MAX);
+       if (linklen < 0) {
+               SYSERROR("Failed to read link path - \"%s\"", path);
+               return false;
+       } else if (linklen >= PATH_MAX) {
+               ERROR("The name of link path is too long - \"%s\"", path);
+               return false;
+       }
+
+       if (strcmp(link, "/dev/log_main") == 0 ||
+           strcmp(link, "/dev/log_system") == 0 ||
+           strcmp(link, "/dev/log_radio") == 0)
+               return true;
+
+       return false;
+}
+#endif
+
 int lxc_check_inherited(struct lxc_conf *conf, bool closeall,
                        int *fds_to_ignore, size_t len_fds)
 {
@@ -273,6 +300,11 @@ restart:
                if (match_stdfds(fd))
                        continue;
 
+#ifdef HAVE_DLOG
+               if (match_dlog_fds(direntp))
+                       continue;
+
+#endif
                if (closeall) {
                        close(fd);
                        closedir(dir);
@@ -295,7 +327,6 @@ restart:
 static int setup_signal_fd(sigset_t *oldmask)
 {
        int ret;
-       int sig;
        sigset_t mask;
        const int signals[] = {SIGBUS, SIGILL, SIGSEGV, SIGWINCH};
 
@@ -304,7 +335,7 @@ static int setup_signal_fd(sigset_t *oldmask)
        if (ret < 0)
                return -EBADF;
 
-       for (sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
+       for (int sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
                ret = sigdelset(&mask, signals[sig]);
                if (ret < 0)
                        return -EBADF;
@@ -375,15 +406,29 @@ static int signal_handler(int fd, uint32_t events, void *data,
        }
 
        if (siginfo.ssi_signo == SIGHUP) {
-               kill(hdlr->pid, SIGTERM);
+               if (hdlr->pidfd >= 0)
+                       lxc_raw_pidfd_send_signal(hdlr->pidfd, SIGTERM, NULL, 0);
+               else if (hdlr->proc_pidfd >= 0)
+                       lxc_raw_pidfd_send_signal(hdlr->proc_pidfd, SIGTERM, NULL, 0);
+               else
+                       kill(hdlr->pid, SIGTERM);
                INFO("Killing %d since terminal hung up", hdlr->pid);
-               return hdlr->init_died ? LXC_MAINLOOP_CLOSE : LXC_MAINLOOP_CONTINUE;
+               return hdlr->init_died ? LXC_MAINLOOP_CLOSE
+                                      : LXC_MAINLOOP_CONTINUE;
        }
 
        if (siginfo.ssi_signo != SIGCHLD) {
-               kill(hdlr->pid, siginfo.ssi_signo);
+               if (hdlr->pidfd >= 0)
+                       lxc_raw_pidfd_send_signal(hdlr->pidfd,
+                                                 siginfo.ssi_signo, NULL, 0);
+               else if (hdlr->proc_pidfd >= 0)
+                       lxc_raw_pidfd_send_signal(hdlr->proc_pidfd,
+                                                 siginfo.ssi_signo, NULL, 0);
+               else
+                       kill(hdlr->pid, siginfo.ssi_signo);
                INFO("Forwarded signal %d to pid %d", siginfo.ssi_signo, hdlr->pid);
-               return hdlr->init_died ? LXC_MAINLOOP_CLOSE : LXC_MAINLOOP_CONTINUE;
+               return hdlr->init_died ? LXC_MAINLOOP_CLOSE
+                                      : LXC_MAINLOOP_CONTINUE;
        }
 
        /* More robustness, protect ourself from a SIGCHLD sent
@@ -392,18 +437,24 @@ static int signal_handler(int fd, uint32_t events, void *data,
        if (siginfo.ssi_pid != hdlr->pid) {
                NOTICE("Received %d from pid %d instead of container init %d",
                       siginfo.ssi_signo, siginfo.ssi_pid, hdlr->pid);
-               return hdlr->init_died ? LXC_MAINLOOP_CLOSE : LXC_MAINLOOP_CONTINUE;
+               return hdlr->init_died ? LXC_MAINLOOP_CLOSE
+                                      : LXC_MAINLOOP_CONTINUE;
        }
 
        if (siginfo.ssi_code == CLD_STOPPED) {
                INFO("Container init process was stopped");
-               return hdlr->init_died ? LXC_MAINLOOP_CLOSE : LXC_MAINLOOP_CONTINUE;
-       } else if (siginfo.ssi_code == CLD_CONTINUED) {
+               return hdlr->init_died ? LXC_MAINLOOP_CLOSE
+                                      : LXC_MAINLOOP_CONTINUE;
+       }
+
+       if (siginfo.ssi_code == CLD_CONTINUED) {
                INFO("Container init process was continued");
-               return hdlr->init_died ? LXC_MAINLOOP_CLOSE : LXC_MAINLOOP_CONTINUE;
+               return hdlr->init_died ? LXC_MAINLOOP_CLOSE
+                                      : LXC_MAINLOOP_CONTINUE;
        }
 
        DEBUG("Container init process %d exited", hdlr->pid);
+
        return LXC_MAINLOOP_CLOSE;
 }
 
@@ -413,7 +464,6 @@ int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
        size_t retlen;
        ssize_t ret;
        struct lxc_list *cur, *next;
-       struct lxc_state_client *client;
        struct lxc_msg msg = {.type = lxc_msg_state, .value = state};
 
        if (state == THAWED)
@@ -433,7 +483,7 @@ int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
                return -E2BIG;
 
        lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
-               client = cur->elem;
+               struct lxc_state_client *client = cur->elem;
 
                if (client->states[state] == 0) {
                        TRACE("State %s not registered for state client %d",
@@ -553,6 +603,12 @@ int lxc_poll(const char *name, struct lxc_handler *handler)
                goto out_mainloop_console;
        }
 
+       ret = lxc_seccomp_setup_proxy(&handler->conf->seccomp, &descr, handler);
+       if (ret < 0) {
+               ERROR("Failed to setup seccomp proxy");
+               goto out_mainloop_console;
+       }
+
        if (has_console) {
                struct lxc_terminal *console = &handler->conf->console;
 
@@ -614,6 +670,10 @@ void lxc_zero_handler(struct lxc_handler *handler)
 
        handler->pinfd = -1;
 
+       handler->pidfd = -EBADF;
+
+       handler->proc_pidfd = -EBADF;
+
        handler->sigfd = -1;
 
        for (i = 0; i < LXC_NS_MAX; i++)
@@ -634,6 +694,12 @@ void lxc_free_handler(struct lxc_handler *handler)
        if (handler->pinfd >= 0)
                close(handler->pinfd);
 
+       if (handler->pidfd >= 0)
+               close(handler->pidfd);
+
+       if (handler->proc_pidfd >= 0)
+               close(handler->proc_pidfd);
+
        if (handler->sigfd >= 0)
                close(handler->sigfd);
 
@@ -649,6 +715,9 @@ void lxc_free_handler(struct lxc_handler *handler)
        if (handler->state_socket_pair[1] >= 0)
                close(handler->state_socket_pair[1]);
 
+       if (handler->cgroup_ops)
+               cgroup_exit(handler->cgroup_ops);
+
        handler->conf = NULL;
        free(handler);
        handler = NULL;
@@ -675,6 +744,8 @@ struct lxc_handler *lxc_init_handler(const char *name, struct lxc_conf *conf,
        handler->conf = conf;
        handler->lxcpath = lxcpath;
        handler->pinfd = -1;
+       handler->pidfd = -EBADF;
+       handler->proc_pidfd = -EBADF;
        handler->sigfd = -EBADF;
        handler->init_died = false;
        handler->state_socket_pair[0] = handler->state_socket_pair[1] = -1;
@@ -959,7 +1030,6 @@ void lxc_fini(const char *name, struct lxc_handler *handler)
 
        cgroup_ops->payload_destroy(cgroup_ops, handler);
        cgroup_ops->monitor_destroy(cgroup_ops, handler);
-       cgroup_exit(cgroup_ops);
 
        if (handler->conf->reboot == REBOOT_NONE) {
                /* For all new state clients simply close the command socket.
@@ -986,6 +1056,9 @@ void lxc_fini(const char *name, struct lxc_handler *handler)
                lxc_set_state(name, handler, STOPPED);
        }
 
+       /* Avoid lingering namespace references. */
+       lxc_put_nsfds(handler);
+
        ret = run_lxc_hooks(name, "post-stop", handler->conf, NULL);
        if (ret < 0) {
                ERROR("Failed to run lxc.hook.post-stop for container \"%s\"", name);
@@ -1034,32 +1107,37 @@ void lxc_fini(const char *name, struct lxc_handler *handler)
 
 void lxc_abort(const char *name, struct lxc_handler *handler)
 {
-       int ret, status;
+       int ret = 0;
+       int status;
 
        lxc_set_state(name, handler, ABORTING);
 
-       if (handler->pid > 0) {
+       if (handler->pidfd > 0)
+               ret = lxc_raw_pidfd_send_signal(handler->pidfd, SIGKILL, NULL, 0);
+       else if (handler->proc_pidfd > 0)
+               ret = lxc_raw_pidfd_send_signal(handler->proc_pidfd, SIGKILL, NULL, 0);
+       else if (handler->pid > 0)
                ret = kill(handler->pid, SIGKILL);
-               if (ret < 0)
-                       SYSERROR("Failed to send SIGKILL to %d", handler->pid);
-       }
+       if (ret < 0)
+               SYSERROR("Failed to send SIGKILL to %d", handler->pid);
 
-       while ((ret = waitpid(-1, &status, 0)) > 0) {
-               ;
-       }
+       do {
+               ret = waitpid(-1, &status, 0);
+       } while (ret > 0);
 }
 
 static int do_start(void *data)
 {
+       struct lxc_handler *handler = data;
+       ATTR_UNUSED __do_close_prot_errno int data_sock0 = handler->data_sock[0],
+                                             data_sock1 = handler->data_sock[1];
        int ret;
-       char path[PATH_MAX];
        uid_t new_uid;
        gid_t new_gid;
        struct lxc_list *iterator;
        uid_t nsuid = 0;
        gid_t nsgid = 0;
        int devnull_fd = -1;
-       struct lxc_handler *handler = data;
 
        lxc_sync_fini_parent(handler);
 
@@ -1069,7 +1147,7 @@ static int do_start(void *data)
         * exit before we set the pdeath signal leading to a unsupervized
         * container.
         */
-       ret = lxc_set_death_signal(SIGKILL, 0);
+       ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid);
        if (ret < 0) {
                SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
                goto out_warn_father;
@@ -1115,10 +1193,12 @@ static int do_start(void *data)
        if (ret < 0)
                goto out_error;
 
-       ret = lxc_network_recv_veth_names_from_parent(handler);
-       if (ret < 0) {
-               ERROR("Failed to receive veth names from parent");
-               goto out_warn_father;
+       if (handler->ns_clone_flags & CLONE_NEWNET) {
+               ret = lxc_network_recv_veth_names_from_parent(handler);
+               if (ret < 0) {
+                       ERROR("Failed to receive veth names from parent");
+                       goto out_warn_father;
+               }
        }
 
        /* If we are in a new user namespace, become root there to have
@@ -1147,7 +1227,7 @@ static int do_start(void *data)
                        goto out_warn_father;
 
                /* set{g,u}id() clears deathsignal */
-               ret = lxc_set_death_signal(SIGKILL, 0);
+               ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid);
                if (ret < 0) {
                        SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
                        goto out_warn_father;
@@ -1160,11 +1240,6 @@ static int do_start(void *data)
                goto out_warn_father;
        }
 
-       ret = snprintf(path, sizeof(path), "%s/dev/null",
-                      handler->conf->rootfs.mount);
-       if (ret < 0 || ret >= sizeof(path))
-               goto out_warn_father;
-
        /* In order to checkpoint restore, we need to have everything in the
         * same mount namespace. However, some containers may not have a
         * reasonable /dev (in particular, they may not have /dev/null), so we
@@ -1176,6 +1251,13 @@ static int do_start(void *data)
         * where it isn't wanted.
         */
        if (handler->daemonize && !handler->conf->autodev) {
+               char path[PATH_MAX];
+               
+               ret = snprintf(path, sizeof(path), "%s/dev/null",
+                              handler->conf->rootfs.mount);
+               if (ret < 0 || ret >= sizeof(path))
+                       goto out_warn_father;
+               
                ret = access(path, F_OK);
                if (ret != 0) {
                        devnull_fd = open_devnull();
@@ -1208,10 +1290,16 @@ static int do_start(void *data)
        if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
                ret = unshare(CLONE_NEWCGROUP);
                if (ret < 0) {
-                       INFO("Failed to unshare CLONE_NEWCGROUP");
-                       goto out_warn_father;
+                       if (errno != EINVAL) {
+                               SYSERROR("Failed to unshare CLONE_NEWCGROUP");
+                               goto out_warn_father;
+                       }
+
+                       handler->ns_clone_flags &= ~CLONE_NEWCGROUP;
+                       SYSINFO("Kernel does not support CLONE_NEWCGROUP");
+               } else {
+                       INFO("Unshared CLONE_NEWCGROUP");
                }
-               INFO("Unshared CLONE_NEWCGROUP");
        }
 
        /* Add the requested environment variables to the current environment to
@@ -1229,8 +1317,6 @@ static int do_start(void *data)
 
        /* Setup the container, ip, names, utsname, ... */
        ret = lxc_setup(handler);
-       close(handler->data_sock[1]);
-       close(handler->data_sock[0]);
        if (ret < 0) {
                ERROR("Failed to setup container \"%s\"", handler->name);
                goto out_warn_father;
@@ -1281,6 +1367,12 @@ static int do_start(void *data)
        if (ret < 0)
                goto out_warn_father;
 
+       ret = lxc_seccomp_send_notifier_fd(&handler->conf->seccomp, data_sock0);
+       if (ret < 0) {
+               SYSERROR("Failed to send seccomp notify fd to parent");
+               goto out_warn_father;
+       }
+
        ret = run_lxc_hooks(handler->name, "start", handler->conf, NULL);
        if (ret < 0) {
                ERROR("Failed to run lxc.hook.start for container \"%s\"",
@@ -1389,7 +1481,7 @@ static int do_start(void *data)
        }
 
        if (handler->conf->monitor_signal_pdeath != SIGKILL) {
-               ret = lxc_set_death_signal(handler->conf->monitor_signal_pdeath, 0);
+               ret = lxc_set_death_signal(handler->conf->monitor_signal_pdeath, handler->monitor_pid);
                if (ret < 0) {
                        SYSERROR("Failed to set PR_SET_PDEATHSIG to %d",
                                 handler->conf->monitor_signal_pdeath);
@@ -1460,11 +1552,11 @@ int resolve_clone_flags(struct lxc_handler *handler)
        struct lxc_conf *conf = handler->conf;
 
        for (i = 0; i < LXC_NS_MAX; i++) {
-               if (conf->ns_keep != 0) {
-                       if ((conf->ns_keep & ns_info[i].clone_flag) == 0)
+               if (conf->ns_keep) {
+                       if (!(conf->ns_keep & ns_info[i].clone_flag))
                                handler->ns_clone_flags |= ns_info[i].clone_flag;
-               } else if (conf->ns_clone != 0) {
-                       if ((conf->ns_clone & ns_info[i].clone_flag) > 0)
+               } else if (conf->ns_clone) {
+                       if ((conf->ns_clone & ns_info[i].clone_flag))
                                handler->ns_clone_flags |= ns_info[i].clone_flag;
                } else {
                        if (i == LXC_NS_USER && lxc_list_empty(&handler->conf->id_map))
@@ -1496,7 +1588,7 @@ int resolve_clone_flags(struct lxc_handler *handler)
  * getpid() in the child would return the parent's pid. This is all fixed in
  * newer glibc versions where the getpid() cache is removed and the pid/tid is
  * not reset anymore.
- * However, if for whatever reason you - dear commiter - somehow need to get the
+ * However, if for whatever reason you - dear committer - somehow need to get the
  * pid of the dummy intermediate process for do_share_ns() you need to call
  * lxc_raw_getpid(). The next lxc_raw_clone() call does not employ CLONE_VM and
  * will be fine.
@@ -1527,80 +1619,33 @@ static inline int do_share_ns(void *arg)
 
        flags = handler->ns_on_clone_flags;
        flags |= CLONE_PARENT;
-       handler->pid = lxc_raw_clone_cb(do_start, handler, flags);
+       handler->pid = lxc_raw_clone_cb(do_start, handler, CLONE_PIDFD | flags,
+                                       &handler->pidfd);
        if (handler->pid < 0)
                return -1;
 
        return 0;
 }
 
-static int lxc_setup_shmount(struct lxc_conf *conf)
+static int proc_pidfd_open(pid_t pid)
 {
-       size_t len_cont;
-       char *full_cont_path;
-       int ret = -1;
-
-       /* Construct the shmount path under the container root. */
-       len_cont = strlen(conf->rootfs.mount) + 1 + strlen(conf->shmount.path_cont);
-       /* +1 for the terminating '\0' */
-       full_cont_path = malloc(len_cont + 1);
-       if (!full_cont_path) {
-               SYSERROR("Not enough memory");
-               return -ENOMEM;
-       }
+       __do_close_prot_errno int proc_pidfd = -EBADF;
+       char path[100];
 
-       ret = snprintf(full_cont_path, len_cont + 1, "%s/%s",
-                      conf->rootfs.mount, conf->shmount.path_cont);
-       if (ret < 0 || ret >= len_cont + 1) {
-               SYSERROR("Failed to create filename");
-               free(full_cont_path);
+       snprintf(path, sizeof(path), "/proc/%d", pid);
+       proc_pidfd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+       if (proc_pidfd < 0) {
+               SYSERROR("Failed to open %s", path);
                return -1;
        }
 
-       /* Check if shmount point is already set up. */
-       if (is_shared_mountpoint(conf->shmount.path_host)) {
-               INFO("Path \"%s\" is already MS_SHARED. Reusing",
-                    conf->shmount.path_host);
-               free(full_cont_path);
-               return 0;
-       }
-
-       /* Create host and cont mount paths */
-       ret = mkdir_p(conf->shmount.path_host, 0711);
-       if (ret < 0 && errno != EEXIST) {
-               SYSERROR("Failed to create directory \"%s\"",
-                        conf->shmount.path_host);
-               free(full_cont_path);
-               return ret;
-       }
-
-       ret = mkdir_p(full_cont_path, 0711);
-       if (ret < 0 && errno != EEXIST) {
-               SYSERROR("Failed to create directory \"%s\"", full_cont_path);
-               free(full_cont_path);
-               return ret;
-       }
-
-       /* Prepare host mountpoint */
-       ret = mount("tmpfs", conf->shmount.path_host, "tmpfs", 0,
-                   "size=100k,mode=0711");
-       if (ret < 0) {
-               SYSERROR("Failed to mount \"%s\"", conf->shmount.path_host);
-               free(full_cont_path);
-               return ret;
-       }
-
-       ret = mount(conf->shmount.path_host, conf->shmount.path_host, "none",
-                   MS_REC | MS_SHARED, "");
-       if (ret < 0) {
-               SYSERROR("Failed to make shared \"%s\"", conf->shmount.path_host);
-               free(full_cont_path);
-               return ret;
+       /* Test whether we can send signals. */
+       if (lxc_raw_pidfd_send_signal(proc_pidfd, 0, NULL, 0)) {
+               SYSERROR("Failed to send signal through pidfd");
+               return -1;
        }
 
-       INFO("Setup shared mount point \"%s\"", conf->shmount.path_host);
-       free(full_cont_path);
-       return 0;
+       return move_fd(proc_pidfd);
 }
 
 /* lxc_spawn() performs crucial setup tasks and clone()s the new process which
@@ -1612,6 +1657,7 @@ static int lxc_setup_shmount(struct lxc_conf *conf)
  */
 static int lxc_spawn(struct lxc_handler *handler)
 {
+       __do_close_prot_errno int data_sock0 = -EBADF, data_sock1 = -EBADF;
        int i, ret;
        char pidstr[20];
        bool wants_to_map_ids;
@@ -1644,47 +1690,13 @@ static int lxc_spawn(struct lxc_handler *handler)
                         handler->data_sock);
        if (ret < 0)
                goto out_sync_fini;
+       data_sock0 = handler->data_sock[0];
+       data_sock1 = handler->data_sock[1];
 
        ret = resolve_clone_flags(handler);
        if (ret < 0)
                goto out_sync_fini;
 
-       if (conf->shmount.path_host) {
-               if (!conf->shmount.path_cont)
-                       goto out_sync_fini;
-
-               ret = lxc_setup_shmount(conf);
-               if (ret < 0) {
-                       ERROR("Failed to setup shared mount point");
-                       goto out_sync_fini;
-               }
-       }
-
-       if (handler->ns_clone_flags & CLONE_NEWNET) {
-               if (!lxc_list_empty(&conf->network)) {
-
-                       /* Find gateway addresses from the link device, which is
-                        * no longer accessible inside the container. Do this
-                        * before creating network interfaces, since goto
-                        * out_delete_net does not work before lxc_clone.
-                        */
-                       ret = lxc_find_gateway_addresses(handler);
-                       if (ret < 0) {
-                               ERROR("Failed to find gateway addresses");
-                               goto out_sync_fini;
-                       }
-
-                       /* That should be done before the clone because we will
-                        * fill the netdev index and use them in the child.
-                        */
-                       ret = lxc_create_network_priv(handler);
-                       if (ret < 0) {
-                               ERROR("Failed to create the network");
-                               goto out_delete_net;
-                       }
-               }
-       }
-
        if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
                ERROR("Failed creating cgroups");
                goto out_delete_net;
@@ -1717,7 +1729,7 @@ static int lxc_spawn(struct lxc_handler *handler)
                pid_t attacher_pid;
 
                attacher_pid = lxc_clone(do_share_ns, handler,
-                                        CLONE_VFORK | CLONE_VM | CLONE_FILES);
+                                        CLONE_VFORK | CLONE_VM | CLONE_FILES, NULL);
                if (attacher_pid < 0) {
                        SYSERROR(LXC_CLONE_ERROR);
                        goto out_delete_net;
@@ -1730,7 +1742,8 @@ static int lxc_spawn(struct lxc_handler *handler)
                }
        } else {
                handler->pid = lxc_raw_clone_cb(do_start, handler,
-                                               handler->ns_on_clone_flags);
+                                               CLONE_PIDFD | handler->ns_on_clone_flags,
+                                               &handler->pidfd);
        }
        if (handler->pid < 0) {
                SYSERROR(LXC_CLONE_ERROR);
@@ -1738,6 +1751,20 @@ static int lxc_spawn(struct lxc_handler *handler)
        }
        TRACE("Cloned child process %d", handler->pid);
 
+       if (handler->pidfd < 0) {
+               handler->proc_pidfd = proc_pidfd_open(handler->pid);
+               if (handler->proc_pidfd < 0 && (errno != ENOSYS))
+                       goto out_delete_net;
+       }
+
+       ret = snprintf(pidstr, 20, "%d", handler->pid);
+       if (ret < 0 || ret >= 20)
+               goto out_delete_net;
+
+       ret = setenv("LXC_PID", pidstr, 1);
+       if (ret < 0)
+               SYSERROR("Failed to set environment variable: LXC_PID=%s", pidstr);
+
        for (i = 0; i < LXC_NS_MAX; i++)
                if (handler->ns_on_clone_flags & ns_info[i].clone_flag)
                        INFO("Cloned %s", ns_info[i].flag_name);
@@ -1785,49 +1812,40 @@ static int lxc_spawn(struct lxc_handler *handler)
        if (!cgroup_ops->chown(cgroup_ops, handler->conf))
                goto out_delete_net;
 
-       /* Now we're ready to preserve the network namespace */
-       ret = lxc_try_preserve_ns(handler->pid, "net");
-       if (ret < 0) {
-               if (ret != -EOPNOTSUPP) {
-                       SYSERROR("Failed to preserve net namespace");
-                       goto out_delete_net;
+       /* If not done yet, we're now ready to preserve the network namespace */
+       if (handler->nsfd[LXC_NS_NET] < 0) {
+               ret = lxc_try_preserve_ns(handler->pid, "net");
+               if (ret < 0) {
+                       if (ret != -EOPNOTSUPP) {
+                               SYSERROR("Failed to preserve net namespace");
+                               goto out_delete_net;
+                       }
+               } else {
+                       handler->nsfd[LXC_NS_NET] = ret;
+                       DEBUG("Preserved net namespace via fd %d", ret);
                }
-       } else {
-               handler->nsfd[LXC_NS_NET] = ret;
-               DEBUG("Preserved net namespace via fd %d", ret);
-
-               ret = lxc_netns_set_nsid(handler->nsfd[LXC_NS_NET]);
-               if (ret < 0)
-                       SYSERROR("Failed to allocate new network namespace id");
-               else
-                       TRACE("Allocated new network namespace id");
        }
+       ret = lxc_netns_set_nsid(handler->nsfd[LXC_NS_NET]);
+       if (ret < 0)
+               SYSWARN("Failed to allocate new network namespace id");
+       else
+               TRACE("Allocated new network namespace id");
 
        /* Create the network configuration. */
        if (handler->ns_clone_flags & CLONE_NEWNET) {
-               ret = lxc_network_move_created_netdev_priv(handler->lxcpath,
-                                                          handler->name,
-                                                          &conf->network,
-                                                          handler->pid);
+               ret = lxc_create_network(handler);
                if (ret < 0) {
-                       ERROR("Failed to create the configured network");
+                       ERROR("Failed to create the network");
                        goto out_delete_net;
                }
 
-               ret = lxc_create_network_unpriv(handler->lxcpath, handler->name,
-                                               &conf->network, handler->pid, conf->hooks_version);
+               ret = lxc_network_send_veth_names_to_child(handler);
                if (ret < 0) {
-                       ERROR("Failed to create the configured network");
+                       ERROR("Failed to send veth names to child");
                        goto out_delete_net;
                }
        }
 
-       ret = lxc_network_send_veth_names_to_child(handler);
-       if (ret < 0) {
-               ERROR("Failed to send veth names to child");
-               goto out_delete_net;
-       }
-
        if (!lxc_list_empty(&conf->procs)) {
                ret = setup_proc_filesystem(&conf->procs, handler->pid);
                if (ret < 0)
@@ -1873,14 +1891,6 @@ static int lxc_spawn(struct lxc_handler *handler)
                }
        }
 
-       ret = snprintf(pidstr, 20, "%d", handler->pid);
-       if (ret < 0 || ret >= 20)
-               goto out_delete_net;
-
-       ret = setenv("LXC_PID", pidstr, 1);
-       if (ret < 0)
-               SYSERROR("Failed to set environment variable: LXC_PID=%s", pidstr);
-
        /* Run any host-side start hooks */
        ret = run_lxc_hooks(name, "start-host", conf, NULL);
        if (ret < 0) {
@@ -1898,15 +1908,16 @@ static int lxc_spawn(struct lxc_handler *handler)
        if (ret < 0)
                goto out_delete_net;
 
-       ret = lxc_network_recv_name_and_ifindex_from_child(handler);
-       if (ret < 0) {
-               ERROR("Failed to receive names and ifindices for network "
-                     "devices from child");
-               goto out_delete_net;
+       if (handler->ns_clone_flags & CLONE_NEWNET) {
+               ret = lxc_network_recv_name_and_ifindex_from_child(handler);
+               if (ret < 0) {
+                       ERROR("Failed to receive names and ifindices for network devices from child");
+                       goto out_delete_net;
+               }
        }
 
        /* Now all networks are created, network devices are moved into place,
-        * and the correct names and ifindeces in the respective namespaces have
+        * and the correct names and ifindices in the respective namespaces have
         * been recorded. The corresponding structs have now all been filled. So
         * log them for debugging purposes.
         */
@@ -1919,6 +1930,12 @@ static int lxc_spawn(struct lxc_handler *handler)
                goto out_delete_net;
        }
 
+       ret = lxc_seccomp_recv_notifier_fd(&handler->conf->seccomp, data_sock1);
+       if (ret < 0) {
+               SYSERROR("Failed to receive seccomp notify fd from child");
+               goto out_delete_net;
+       }
+
        ret = handler->ops->post_start(handler, handler->data);
        if (ret < 0)
                goto out_abort;
@@ -1961,7 +1978,7 @@ int __lxc_start(const char *name, struct lxc_handler *handler,
        ret = lxc_init(name, handler);
        if (ret < 0) {
                ERROR("Failed to initialize container \"%s\"", name);
-               return -1;
+               goto out_fini_nonet;
        }
        handler->ops = ops;
        handler->data = data;
@@ -1970,16 +1987,19 @@ int __lxc_start(const char *name, struct lxc_handler *handler,
 
        if (!attach_block_device(handler->conf)) {
                ERROR("Failed to attach block device");
+               ret = -1;
                goto out_fini_nonet;
        }
 
        if (!cgroup_ops->monitor_create(cgroup_ops, handler)) {
                ERROR("Failed to create monitor cgroup");
+               ret = -1;
                goto out_fini_nonet;
        }
 
        if (!cgroup_ops->monitor_enter(cgroup_ops, handler->monitor_pid)) {
                ERROR("Failed to enter monitor cgroup");
+               ret = -1;
                goto out_fini_nonet;
        }
 
@@ -2008,11 +2028,6 @@ int __lxc_start(const char *name, struct lxc_handler *handler,
                ERROR("Failed to spawn container \"%s\"", name);
                goto out_detach_blockdev;
        }
-       /* close parent side of data socket */
-       close(handler->data_sock[0]);
-       handler->data_sock[0] = -1;
-       close(handler->data_sock[1]);
-       handler->data_sock[1] = -1;
 
        handler->conf->reboot = REBOOT_NONE;
 
@@ -2024,6 +2039,7 @@ int __lxc_start(const char *name, struct lxc_handler *handler,
 
        if (!handler->init_died && handler->pid > 0) {
                ERROR("Child process is not killed");
+               ret = -1;
                goto out_abort;
        }
 
@@ -2125,7 +2141,7 @@ int lxc_start(const char *name, char *const argv[], struct lxc_handler *handler,
 static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
                                            const char *name)
 {
-       char destroy[MAXPATHLEN];
+       char destroy[PATH_MAX];
        struct lxc_container *c;
        int ret = 0;
        bool bret = true;
@@ -2139,8 +2155,8 @@ static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
        }
        INFO("Destroyed rootfs for container \"%s\"", name);
 
-       ret = snprintf(destroy, MAXPATHLEN, "%s/%s", handler->lxcpath, name);
-       if (ret < 0 || ret >= MAXPATHLEN) {
+       ret = snprintf(destroy, PATH_MAX, "%s/%s", handler->lxcpath, name);
+       if (ret < 0 || ret >= PATH_MAX) {
                ERROR("Error destroying directory for container \"%s\"", name);
                return;
        }