ovl_rsync: make sure to umount

[mirror_lxc.git] / src / lxc / start.c
diff --git a/src/lxc/start.c b/src/lxc/start.c

index 251bd26bbdd2c2d2809cce94ed3c53747e6750cc..4e977c52fd9e5d8a3adee7034f1c7e43e8334c79 100644 (file)
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -24,7 +24,6 @@
  #include "config.h"
  
  #include <stdio.h>
-#undef _GNU_SOURCE
  #include <string.h>
  #include <stdlib.h>
  #include <dirent.h>
@@ -32,7 +31,8 @@
  #include <unistd.h>
  #include <signal.h>
  #include <fcntl.h>
-#include <termios.h>
+#include <grp.h>
+#include <poll.h>
  #include <sys/param.h>
  #include <sys/file.h>
  #include <sys/mount.h>
@@ -43,7 +43,6 @@
  #include <sys/types.h>
  #include <sys/wait.h>
  #include <sys/un.h>
-#include <sys/poll.h>
  #include <sys/syscall.h>
  
  #if HAVE_SYS_CAPABILITY_H
@@ -70,8 +69,9 @@
  #include "namespace.h"
  #include "lxcseccomp.h"
  #include "caps.h"
-#include "lxclock.h"
+#include "bdev.h"
  #include "lsm/lsm.h"
+#include "lxclock.h"
  
  lxc_log_define(lxc_start, lxc);
  
@@ -84,38 +84,63 @@ const struct ns_info ns_info[LXC_NS_MAX] = {
         [LXC_NS_NET] = {"net", CLONE_NEWNET}
  };
  
+extern void mod_all_rdeps(struct lxc_container *c, bool inc);
+static bool do_destroy_container(struct lxc_conf *conf);
+static int lxc_rmdir_onedev_wrapper(void *data);
+static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
+                                           const char *name);
+
+static void print_top_failing_dir(const char *path)
+{
+       size_t len = strlen(path);
+       char *copy = alloca(len+1), *p, *e, saved;
+       strcpy(copy, path);
+
+       p = copy;
+       e = copy + len;
+       while (p < e) {
+               while (p < e && *p == '/') p++;
+               while (p < e && *p != '/') p++;
+               saved = *p;
+               *p = '\0';
+               if (access(copy, X_OK)) {
+                       SYSERROR("could not access %s.  Please grant it 'x' " \
+                             "access, or add an ACL for the container root.",
+                             copy);
+                       return;
+               }
+               *p = saved;
+       }
+}
+
  static void close_ns(int ns_fd[LXC_NS_MAX]) {
         int i;
  
-       process_lock();
         for (i = 0; i < LXC_NS_MAX; i++) {
                 if (ns_fd[i] > -1) {
                         close(ns_fd[i]);
                         ns_fd[i] = -1;
                 }
         }
-       process_unlock();
  }
  
  static int preserve_ns(int ns_fd[LXC_NS_MAX], int clone_flags) {
         int i, saved_errno;
         char path[MAXPATHLEN];
  
-       if (access("/proc/self/ns", X_OK)) {
-               ERROR("Does this kernel version support 'attach'?");
-               return -1;
-       }
-
         for (i = 0; i < LXC_NS_MAX; i++)
                 ns_fd[i] = -1;
  
+       if (access("/proc/self/ns", X_OK)) {
+               WARN("Kernel does not support attach; preserve_ns ignored");
+               return 0;
+       }
+
         for (i = 0; i < LXC_NS_MAX; i++) {
                 if ((clone_flags & ns_info[i].clone_flag) == 0)
                         continue;
                 snprintf(path, MAXPATHLEN, "/proc/self/ns/%s", ns_info[i].proc_name);
-               process_lock();
                 ns_fd[i] = open(path, O_RDONLY | O_CLOEXEC);
-               process_unlock();
                 if (ns_fd[i] < 0)
                         goto error;
         }
@@ -152,16 +177,26 @@ static int match_fd(int fd)
         return (fd == 0 || fd == 1 || fd == 2);
  }
  
-int lxc_check_inherited(struct lxc_conf *conf, int fd_to_ignore)
+/*
+ * Check for any fds we need to close
+ * * if fd_to_ignore != -1, then if we find that fd open we will ignore it.
+ * * By default we warn about open fds we find.
+ * * If closeall is true, we will close open fds.
+ * * If lxc-start was passed "-C", then conf->close_all_fds will be true,
+ *     in which case we also close all open fds.
+ * * A daemonized container will always pass closeall=true.
+ */
+int lxc_check_inherited(struct lxc_conf *conf, bool closeall, int fd_to_ignore)
  {
         struct dirent dirent, *direntp;
         int fd, fddir;
         DIR *dir;
  
+       if (conf && conf->close_all_fds)
+               closeall = true;
+
  restart:
-       process_lock();
         dir = opendir("/proc/self/fd");
-       process_unlock();
         if (!dir) {
                 WARN("failed to open directory: %m");
                 return -1;
@@ -184,23 +219,22 @@ restart:
                 if (fd == fddir || fd == lxc_log_fd || fd == fd_to_ignore)
                         continue;
  
+               if (current_config && fd == current_config->logfd)
+                       continue;
+
                 if (match_fd(fd))
                         continue;
  
-               if (conf->close_all_fds) {
-                       process_lock();
+               if (closeall) {
                         close(fd);
                         closedir(dir);
-                       process_unlock();
                         INFO("closed inherited fd %d", fd);
                         goto restart;
                 }
                 WARN("inherited fd %d", fd);
         }
  
-       process_lock();
         closedir(dir); /* cannot fail */
-       process_unlock();
         return 0;
  }
  
@@ -328,7 +362,7 @@ int lxc_poll(const char *name, struct lxc_handler *handler)
                         goto out_mainloop_open;
                 }
                 #else
-                       DEBUG("not starting utmp handler as cap_sys_boot cannot be dropped without capabilities support\n");
+                       DEBUG("not starting utmp handler as cap_sys_boot cannot be dropped without capabilities support");
                 #endif
         }
  
@@ -337,9 +371,7 @@ int lxc_poll(const char *name, struct lxc_handler *handler)
  out_mainloop_open:
         lxc_mainloop_close(&descr);
  out_sigfd:
-       process_lock();
         close(sigfd);
-       process_unlock();
         return -1;
  }
  
@@ -353,6 +385,7 @@ struct lxc_handler *lxc_init(const char *name, struct lxc_conf *conf, const char
  
         memset(handler, 0, sizeof(*handler));
  
+       handler->ttysock[0] = handler->ttysock[1] = -1;
         handler->conf = conf;
         handler->lxcpath = lxcpath;
         handler->pinfd = -1;
@@ -380,16 +413,16 @@ struct lxc_handler *lxc_init(const char *name, struct lxc_conf *conf, const char
         }
  
         /* Start of environment variable setup for hooks */
-       if (setenv("LXC_NAME", name, 1)) {
+       if (name && setenv("LXC_NAME", name, 1)) {
                 SYSERROR("failed to set environment variable for container name");
         }
-       if (setenv("LXC_CONFIG_FILE", conf->rcfile, 1)) {
+       if (conf->rcfile && setenv("LXC_CONFIG_FILE", conf->rcfile, 1)) {
                 SYSERROR("failed to set environment variable for config path");
         }
-       if (setenv("LXC_ROOTFS_MOUNT", conf->rootfs.mount, 1)) {
+       if (conf->rootfs.mount && setenv("LXC_ROOTFS_MOUNT", conf->rootfs.mount, 1)) {
                 SYSERROR("failed to set environment variable for rootfs mount");
         }
-       if (setenv("LXC_ROOTFS_PATH", conf->rootfs.path, 1)) {
+       if (conf->rootfs.path && setenv("LXC_ROOTFS_PATH", conf->rootfs.path, 1)) {
                 SYSERROR("failed to set environment variable for rootfs mount");
         }
         if (conf->console.path && setenv("LXC_CONSOLE", conf->console.path, 1)) {
@@ -405,11 +438,6 @@ struct lxc_handler *lxc_init(const char *name, struct lxc_conf *conf, const char
                 goto out_aborting;
         }
  
-       if (lxc_create_tty(name, conf)) {
-               ERROR("failed to create the ttys");
-               goto out_aborting;
-       }
-
         /* the signal fd has to be created before forking otherwise
          * if the child process exits before we setup the signal fd,
          * the event will be lost and the command will be stuck */
@@ -440,9 +468,7 @@ out_delete_tty:
  out_aborting:
         lxc_set_state(name, handler, ABORTING);
  out_close_maincmd_fd:
-       process_lock();
         close(conf->maincmd_fd);
-       process_unlock();
         conf->maincmd_fd = -1;
  out_free_name:
         free(handler->name);
@@ -452,7 +478,7 @@ out_free:
         return NULL;
  }
  
-static void lxc_fini(const char *name, struct lxc_handler *handler)
+void lxc_fini(const char *name, struct lxc_handler *handler)
  {
         /* The STOPPING state is there for future cleanup code
          * which can take awhile
@@ -469,15 +495,17 @@ static void lxc_fini(const char *name, struct lxc_handler *handler)
  
         lxc_console_delete(&handler->conf->console);
         lxc_delete_tty(&handler->conf->tty_info);
-       process_lock();
         close(handler->conf->maincmd_fd);
-       process_unlock();
         handler->conf->maincmd_fd = -1;
         free(handler->name);
-       if (handler->cgroup) {
-               lxc_cgroup_process_info_free_and_remove(handler->cgroup);
-               handler->cgroup = NULL;
+       if (handler->ttysock[0] != -1) {
+               close(handler->ttysock[0]);
+               close(handler->ttysock[1]);
         }
+       if (handler->conf->ephemeral == 1 && handler->conf->reboot != 1) {
+               lxc_destroy_container_on_signal(handler, name);
+       }
+       cgroup_destroy(handler);
         free(handler);
  }
  
@@ -519,18 +547,14 @@ static int must_drop_cap_sys_boot(struct lxc_conf *conf)
          int status;
          pid_t pid;
  
-       process_lock();
         f = fopen("/proc/sys/kernel/ctrl-alt-del", "r");
-       process_unlock();
         if (!f) {
                 DEBUG("failed to open /proc/sys/kernel/ctrl-alt-del");
                 return 1;
         }
  
         ret = fscanf(f, "%d", &v);
-       process_lock();
         fclose(f);
-       process_unlock();
         if (ret != 1) {
                 DEBUG("Failed to read /proc/sys/kernel/ctrl-alt-del");
                 return 1;
@@ -542,19 +566,22 @@ static int must_drop_cap_sys_boot(struct lxc_conf *conf)
                 flags |= CLONE_NEWUSER;
  
  #ifdef __ia64__
-        pid = __clone2(container_reboot_supported, stack, stack_size, flags,  &cmd);
+       pid = __clone2(container_reboot_supported, stack, stack_size, flags,  &cmd);
  #else
-        stack += stack_size;
-        pid = clone(container_reboot_supported, stack, flags, &cmd);
+       stack += stack_size;
+       pid = clone(container_reboot_supported, stack, flags, &cmd);
  #endif
-        if (pid < 0) {
-                SYSERROR("failed to clone\n");
-                return -1;
-        }
-        if (wait(&status) < 0) {
-                SYSERROR("unexpected wait error: %m\n");
-                return -1;
-        }
+       if (pid < 0) {
+               if (flags & CLONE_NEWUSER)
+                       ERROR("failed to clone (%#x): %s (includes CLONE_NEWUSER)", flags, strerror(errno));
+               else
+                       ERROR("failed to clone (%#x): %s", flags, strerror(errno));
+               return -1;
+       }
+       if (wait(&status) < 0) {
+               SYSERROR("unexpected wait error: %m");
+               return -1;
+       }
  
         if (WEXITSTATUS(status) != 1)
                 return 1;
@@ -562,10 +589,56 @@ static int must_drop_cap_sys_boot(struct lxc_conf *conf)
         return 0;
  }
  
+/*
+ * netpipe is used in the unprivileged case to transfer the veth interface
+ * names (IFNAMSIZ bytes each) from parent to child
+ */
+static int netpipe = -1;
+
+static inline int count_veths(struct lxc_list *network)
+{
+       struct lxc_list *iterator;
+       struct lxc_netdev *netdev;
+       int count = 0;
+
+       lxc_list_for_each(iterator, network) {
+               netdev = iterator->elem;
+               if (netdev->type != LXC_NET_VETH)
+                       continue;
+               count++;
+       }
+       return count;
+}
+
+static int read_unpriv_netifindex(struct lxc_list *network)
+{
+       struct lxc_list *iterator;
+       struct lxc_netdev *netdev;
+
+       if (netpipe == -1)
+               return 0;
+       lxc_list_for_each(iterator, network) {
+               netdev = iterator->elem;
+               if (netdev->type != LXC_NET_VETH)
+                       continue;
+               if (!(netdev->name = malloc(IFNAMSIZ))) {
+                       ERROR("Out of memory");
+                       close(netpipe);
+                       return -1;
+               }
+               if (read(netpipe, netdev->name, IFNAMSIZ) != IFNAMSIZ) {
+                       close(netpipe);
+                       return -1;
+               }
+       }
+       close(netpipe);
+       return 0;
+}
+
  static int do_start(void *data)
  {
+       struct lxc_list *iterator;
         struct lxc_handler *handler = data;
-       const char *lsm_label = NULL;
  
         if (sigprocmask(SIG_SETMASK, &handler->oldmask, NULL)) {
                 SYSERROR("failed to set sigprocmask");
@@ -587,9 +660,7 @@ static int do_start(void *data)
  
         /* don't leak the pinfd to the container */
         if (handler->pinfd >= 0) {
-               process_lock();
                 close(handler->pinfd);
-               process_unlock();
         }
  
         /* Tell the parent task it can begin to configure the
@@ -598,20 +669,42 @@ static int do_start(void *data)
         if (lxc_sync_barrier_parent(handler, LXC_SYNC_CONFIGURE))
                 return -1;
  
+       if (read_unpriv_netifindex(&handler->conf->network) < 0)
+               goto out_warn_father;
+
         /*
          * if we are in a new user namespace, become root there to have
-        * privilege over our namespace
+        * privilege over our namespace. When using lxc-execute we default to root,
+        * but this can be overridden using the lxc.init_uid and lxc.init_gid
+        * configuration options.
          */
         if (!lxc_list_empty(&handler->conf->id_map)) {
-               NOTICE("switching to gid/uid 0 in new user namespace");
-               if (setgid(0)) {
+               gid_t new_gid = 0;
+               if (handler->conf->is_execute && handler->conf->init_gid)
+                       new_gid = handler->conf->init_gid;
+
+               uid_t new_uid = 0;
+               if (handler->conf->is_execute && handler->conf->init_uid)
+                       new_uid = handler->conf->init_uid;
+
+               NOTICE("switching to gid/uid %d/%d in new user namespace", new_gid, new_uid);
+               if (setgid(new_gid)) {
                         SYSERROR("setgid");
                         goto out_warn_father;
                 }
-               if (setuid(0)) {
+               if (setuid(new_uid)) {
                         SYSERROR("setuid");
                         goto out_warn_father;
                 }
+               if (setgroups(0, NULL)) {
+                       SYSERROR("setgroups");
+                       goto out_warn_father;
+               }
+       }
+
+       if (access(handler->lxcpath, X_OK)) {
+               print_top_failing_dir(handler->lxcpath);
+               goto out_warn_father;
         }
  
         #if HAVE_SYS_CAPABILITY_H
@@ -620,12 +713,12 @@ static int do_start(void *data)
                         SYSERROR("failed to remove CAP_SYS_BOOT capability");
                         goto out_warn_father;
                 }
-               DEBUG("Dropped cap_sys_boot\n");
+               DEBUG("Dropped cap_sys_boot");
         }
         #endif
  
         /* Setup the container, ip, names, utsname, ... */
-       if (lxc_setup(handler->name, handler->conf, handler->lxcpath, handler->cgroup, handler->data) ){
+       if (lxc_setup(handler)) {
                 ERROR("failed to setup the container");
                 goto out_warn_father;
         }
@@ -635,13 +728,20 @@ static int do_start(void *data)
                 return -1;
  
         /* Set the label to change to when we exec(2) the container's init */
-       if (!strcmp(lsm_name(), "AppArmor"))
-               lsm_label = handler->conf->lsm_aa_profile;
-       else if (!strcmp(lsm_name(), "SELinux"))
-               lsm_label = handler->conf->lsm_se_context;
-       if (lsm_process_label_set(lsm_label, 1, 1) < 0)
+       if (lsm_process_label_set(NULL, handler->conf, 1, 1) < 0)
                 goto out_warn_father;
-       lsm_proc_unmount(handler->conf);
+
+       /* Some inits, such as busybox, will set sane tty settings on stdin,
+        * stdout and stderr, which they assume to be the console. We already
+        * set them the way we wanted on the real terminal, and we want init to
+        * do its setup on its console, i.e. the pty allocated in
+        * lxc_console_create(), so make sure that pty is stdin, stdout, stderr.
+        */
+       if (lxc_console_set_stdfds(handler) < 0)
+               goto out_warn_father;
+
+       /* If we mounted a temporary proc, then unmount it now */
+       tmp_proc_unmount(handler->conf);
  
         if (lxc_seccomp_load(handler->conf) != 0)
                 goto out_warn_father;
@@ -652,7 +752,7 @@ static int do_start(void *data)
         }
  
         /* The clearenv() and putenv() calls have been moved here
-        * to allow us to use enviroment variables passed to the various
+        * to allow us to use environment variables passed to the various
          * hooks, such as the start hook above.  Not all of the
          * variables like CONFIG_PATH or ROOTFS are valid in this
          * context but others are. */
@@ -661,14 +761,29 @@ static int do_start(void *data)
                 /* don't error out though */
         }
  
+       lxc_list_for_each(iterator, &handler->conf->environment) {
+               if (putenv((char *)iterator->elem)) {
+                       SYSERROR("failed to set environment variable '%s'", (char *)iterator->elem);
+                       goto out_warn_father;
+               }
+       }
+
         if (putenv("container=lxc")) {
-               SYSERROR("failed to set environment variable");
+               SYSERROR("failed to set environment variable 'container=lxc'");
                 goto out_warn_father;
         }
  
-       process_lock();
+       if (handler->conf->pty_names) {
+               if (putenv(handler->conf->pty_names)) {
+                       SYSERROR("failed to set environment variable for container ptys");
+                       goto out_warn_father;
+               }
+       }
+
         close(handler->sigfd);
-       process_unlock();
+
+       if (handler->backgrounded && null_stdfds() < 0)
+               goto out_warn_father;
  
         /* after this call, we are in error because this
          * ops should not return as it execs */
@@ -681,7 +796,7 @@ out_warn_father:
         return -1;
  }
  
-int save_phys_nics(struct lxc_conf *conf)
+static int save_phys_nics(struct lxc_conf *conf)
  {
         struct lxc_list *iterator;
  
@@ -702,7 +817,7 @@ int save_phys_nics(struct lxc_conf *conf)
                         SYSERROR("failed to allocate memory");
                         return -1;
                 }
-               INFO("stored saved_nic #%d idx %d name %s\n", conf->num_savednics,
+               INFO("stored saved_nic #%d idx %d name %s", conf->num_savednics,
                         conf->saved_nics[conf->num_savednics].ifindex,
                         conf->saved_nics[conf->num_savednics].orig_name);
                 conf->num_savednics++;
@@ -711,14 +826,85 @@ int save_phys_nics(struct lxc_conf *conf)
         return 0;
  }
  
-int lxc_spawn(struct lxc_handler *handler)
+static int recv_fd(int sock, int *fd)
+{
+       if (lxc_abstract_unix_recv_fd(sock, fd, NULL, 0) < 0) {
+               SYSERROR("Error receiving tty fd from child");
+               return -1;
+       }
+       if (*fd == -1)
+               return -1;
+       return 0;
+}
+
+static int recv_ttys_from_child(struct lxc_handler *handler)
+{
+       struct lxc_conf *conf = handler->conf;
+       int i, sock = handler->ttysock[1];
+       struct lxc_tty_info *tty_info = &conf->tty_info;
+
+       if (!conf->tty)
+               return 0;
+
+       tty_info->pty_info = malloc(sizeof(*tty_info->pty_info)*conf->tty);
+       if (!tty_info->pty_info) {
+               SYSERROR("failed to allocate pty_info");
+               return -1;
+       }
+
+       for (i = 0; i < conf->tty; i++) {
+               struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
+               pty_info->busy = 0;
+               if (recv_fd(sock, &pty_info->slave) < 0 ||
+                               recv_fd(sock, &pty_info->master) < 0) {
+                       ERROR("Error receiving tty info from child");
+                       return -1;
+               }
+       }
+       tty_info->nbtty = conf->tty;
+
+       return 0;
+}
+
+void resolve_clone_flags(struct lxc_handler *handler)
+{
+       handler->clone_flags = CLONE_NEWPID | CLONE_NEWNS;
+
+       if (!lxc_list_empty(&handler->conf->id_map)) {
+               INFO("Cloning a new user namespace");
+               handler->clone_flags |= CLONE_NEWUSER;
+       }
+
+       if (handler->conf->inherit_ns_fd[LXC_NS_NET] == -1) {
+               if (!lxc_requests_empty_network(handler))
+                       handler->clone_flags |= CLONE_NEWNET;
+       } else {
+               INFO("Inheriting a net namespace");
+       }
+
+       if (handler->conf->inherit_ns_fd[LXC_NS_IPC] == -1) {
+               handler->clone_flags |= CLONE_NEWIPC;
+       } else {
+               INFO("Inheriting an IPC namespace");
+       }
+
+       if (handler->conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
+               handler->clone_flags |= CLONE_NEWUTS;
+       } else {
+               INFO("Inheriting a UTS namespace");
+       }
+}
+
+static int lxc_spawn(struct lxc_handler *handler)
  {
         int failed_before_rename = 0;
         const char *name = handler->name;
-       struct cgroup_meta_data *cgroup_meta = NULL;
-       const char *cgroup_pattern = NULL;
+       bool cgroups_connected = false;
         int saved_ns_fd[LXC_NS_MAX];
         int preserve_mask = 0, i;
+       int netpipepair[2], nveths;
+
+       netpipe = -1;
  
         for (i = 0; i < LXC_NS_MAX; i++)
                 if (handler->conf->inherit_ns_fd[i] != -1)
@@ -727,16 +913,14 @@ int lxc_spawn(struct lxc_handler *handler)
         if (lxc_sync_init(handler))
                 return -1;
  
-       handler->clone_flags = CLONE_NEWPID|CLONE_NEWNS;
-       if (!lxc_list_empty(&handler->conf->id_map)) {
-               INFO("Cloning a new user namespace");
-               handler->clone_flags |= CLONE_NEWUSER;
+       if (socketpair(AF_UNIX, SOCK_DGRAM, 0, handler->ttysock) < 0) {
+               lxc_sync_fini(handler);
+               return -1;
         }
  
-       if (handler->conf->inherit_ns_fd[LXC_NS_NET] == -1) {
-               if (!lxc_requests_empty_network(handler))
-                       handler->clone_flags |= CLONE_NEWNET;
+       resolve_clone_flags(handler);
  
+       if (handler->clone_flags & CLONE_NEWNET) {
                 if (!lxc_list_empty(&handler->conf->network)) {
  
                         /* Find gateway addresses from the link device, which is
@@ -763,58 +947,45 @@ int lxc_spawn(struct lxc_handler *handler)
                         ERROR("failed to save physical nic info");
                         goto out_abort;
                 }
-       } else {
-               INFO("Inheriting a net namespace");
         }
  
-       if (handler->conf->inherit_ns_fd[LXC_NS_IPC] == -1) {
-               handler->clone_flags |= CLONE_NEWIPC;
-       } else {
-               INFO("Inheriting an IPC namespace");
-       }
-
-       if (handler->conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
-               handler->clone_flags |= CLONE_NEWUTS;
-       } else {
-               INFO("Inheriting a UTS namespace");
-       }
-
-
-       cgroup_meta = lxc_cgroup_load_meta();
-       if (!cgroup_meta) {
-               ERROR("failed to detect cgroup metadata");
+       if (!cgroup_init(handler)) {
+               ERROR("failed initializing cgroup support");
                 goto out_delete_net;
         }
  
-       /* if we are running as root, use system cgroup pattern, otherwise
-        * just create a cgroup under the current one. But also fall back to
-        * that if for some reason reading the configuration fails and no
-        * default value is available
-        */
-       if (getuid() == 0)
-               cgroup_pattern = default_cgroup_pattern();
-       if (!cgroup_pattern)
-               cgroup_pattern = "%n";
+       cgroups_connected = true;
  
-       /* Create cgroup before doing clone(), so the child will know from
-        * handler which cgroup it is going to be put in later.
-        */
-       if ((handler->cgroup = lxc_cgroup_create(name, cgroup_pattern, cgroup_meta, NULL)) == NULL) {
-               ERROR("failed to create cgroups for '%s'", name);
+       if (!cgroup_create(handler)) {
+               ERROR("failed creating cgroups");
                 goto out_delete_net;
         }
  
         /*
          * if the rootfs is not a blockdev, prevent the container from
          * marking it readonly.
+        *
+        * if the container is unprivileged then skip rootfs pinning
          */
+       if (lxc_list_empty(&handler->conf->id_map)) {
+               handler->pinfd = pin_rootfs(handler->conf->rootfs.path);
+               if (handler->pinfd == -1)
+                       INFO("failed to pin the container's rootfs");
+       }
  
-       handler->pinfd = pin_rootfs(handler->conf->rootfs.path);
-       if (handler->pinfd == -1)
-               INFO("failed to pin the container's rootfs");
+       if (preserve_ns(saved_ns_fd, preserve_mask) < 0)
+               goto out_delete_net;
+       if (attach_ns(handler->conf->inherit_ns_fd) < 0)
+               goto out_delete_net;
  
-       preserve_ns(saved_ns_fd, preserve_mask);
-       attach_ns(handler->conf->inherit_ns_fd);
+       if (am_unpriv() && (nveths = count_veths(&handler->conf->network))) {
+               if (pipe(netpipepair) < 0) {
+                       SYSERROR("Error creating pipe");
+                       goto out_delete_net;
+               }
+               /* store netpipe in the global var for do_start's use */
+               netpipe = netpipepair[0];
+       }
  
         /* Create a process in a new set of namespaces */
         handler->pid = lxc_clone(do_start, handler, handler->clone_flags);
@@ -823,27 +994,27 @@ int lxc_spawn(struct lxc_handler *handler)
                 goto out_delete_net;
         }
  
-       attach_ns(saved_ns_fd);
+       if (attach_ns(saved_ns_fd))
+               WARN("failed to restore saved namespaces");
  
         lxc_sync_fini_child(handler);
  
         if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE))
                 failed_before_rename = 1;
  
-       /* In case there is still legacy ns cgroup support in the kernel.
-        * Should be removed at some later point in time.
-        */
-       if (lxc_cgroup_create_legacy(handler->cgroup, name, handler->pid) < 0) {
-               ERROR("failed to create legacy ns cgroups for '%s'", name);
+       if (!cgroup_create_legacy(handler)) {
+               ERROR("failed to setup the legacy cgroups for %s", name);
                 goto out_delete_net;
         }
-
-       if (lxc_setup_cgroup_without_devices(handler, &handler->conf->cgroup)) {
-               ERROR("failed to setup the cgroups for '%s'", name);
+       if (!cgroup_setup_limits(handler, false)) {
+               ERROR("failed to setup the cgroup limits for '%s'", name);
                 goto out_delete_net;
         }
  
-       if (lxc_cgroup_enter(handler->cgroup, handler->pid, false) < 0)
+       if (!cgroup_enter(handler))
+               goto out_delete_net;
+
+       if (!cgroup_chown(handler))
                 goto out_delete_net;
  
         if (failed_before_rename)
@@ -857,6 +1028,23 @@ int lxc_spawn(struct lxc_handler *handler)
                 }
         }
  
+       if (netpipe != -1) {
+               struct lxc_list *iterator;
+               struct lxc_netdev *netdev;
+
+               close(netpipe);
+               lxc_list_for_each(iterator, &handler->conf->network) {
+                       netdev = iterator->elem;
+                       if (netdev->type != LXC_NET_VETH)
+                               continue;
+                       if (write(netpipepair[1], netdev->name, IFNAMSIZ) != IFNAMSIZ) {
+                               ERROR("Error writing veth name to container");
+                               goto out_delete_net;
+                       }
+               }
+               close(netpipepair[1]);
+       }
+
         /* map the container uids - the container became an invalid
          * userid the moment it was cloned with CLONE_NEWUSER - this
          * call doesn't change anything immediately, but allows the
@@ -873,11 +1061,20 @@ int lxc_spawn(struct lxc_handler *handler)
         if (lxc_sync_barrier_child(handler, LXC_SYNC_POST_CONFIGURE))
                 goto out_delete_net;
  
-       if (lxc_setup_cgroup_devices(handler, &handler->conf->cgroup)) {
+       if (!cgroup_setup_limits(handler, true)) {
                 ERROR("failed to setup the devices cgroup for '%s'", name);
                 goto out_delete_net;
         }
  
+       cgroup_disconnect();
+       cgroups_connected = false;
+
+       /* read tty fds allocated by child */
+       if (recv_ttys_from_child(handler) < 0) {
+               ERROR("failed to receive tty info from child");
+               goto out_delete_net;
+       }
+
         /* Tell the child to complete its initialization and wait for
          * it to exec or return an error.  (the child will never
          * return LXC_SYNC_POST_CGROUP+1.  It will either close the
@@ -900,34 +1097,54 @@ int lxc_spawn(struct lxc_handler *handler)
                 goto out_abort;
         }
  
-       lxc_cgroup_put_meta(cgroup_meta);
         lxc_sync_fini(handler);
  
         return 0;
  
  out_delete_net:
+       if (cgroups_connected)
+               cgroup_disconnect();
         if (handler->clone_flags & CLONE_NEWNET)
                 lxc_delete_network(handler);
  out_abort:
-       lxc_cgroup_put_meta(cgroup_meta);
         lxc_abort(name, handler);
         lxc_sync_fini(handler);
         if (handler->pinfd >= 0) {
-               process_lock();
                 close(handler->pinfd);
-               process_unlock();
                 handler->pinfd = -1;
         }
  
         return -1;
  }
  
+int get_netns_fd(int pid)
+{
+       char path[MAXPATHLEN];
+       int ret, fd;
+
+       ret = snprintf(path, MAXPATHLEN, "/proc/%d/ns/net", pid);
+       if (ret < 0 || ret >= MAXPATHLEN) {
+               WARN("Failed to pin netns file for pid %d", pid);
+               return -1;
+       }
+
+       fd = open(path, O_RDONLY);
+       if (fd < 0) {
+               WARN("Failed to pin netns file %s for pid %d: %s",
+                               path, pid, strerror(errno));
+               return -1;
+       }
+       return fd;
+}
+
  int __lxc_start(const char *name, struct lxc_conf *conf,
-               struct lxc_operations* ops, void *data, const char *lxcpath)
+               struct lxc_operations* ops, void *data, const char *lxcpath,
+               bool backgrounded)
  {
         struct lxc_handler *handler;
         int err = -1;
         int status;
+       int netnsfd = -1;
  
         handler = lxc_init(name, conf, lxcpath);
         if (!handler) {
@@ -936,27 +1153,55 @@ int __lxc_start(const char *name, struct lxc_conf *conf,
         }
         handler->ops = ops;
         handler->data = data;
+       handler->backgrounded = backgrounded;
  
         if (must_drop_cap_sys_boot(handler->conf)) {
                 #if HAVE_SYS_CAPABILITY_H
-               DEBUG("Dropping cap_sys_boot\n");
+               DEBUG("Dropping cap_sys_boot");
                 #else
-               DEBUG("Can't drop cap_sys_boot as capabilities aren't supported\n");
+               DEBUG("Can't drop cap_sys_boot as capabilities aren't supported");
                 #endif
         } else {
-               DEBUG("Not dropping cap_sys_boot or watching utmp\n");
+               DEBUG("Not dropping cap_sys_boot or watching utmp");
                 handler->conf->need_utmp_watch = 0;
         }
  
+       if (!attach_block_device(handler->conf)) {
+               ERROR("Failure attaching block device");
+               goto out_fini_nonet;
+       }
+
+       if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) {
+               /* if the backing store is a device, mount it here and now */
+               if (rootfs_is_blockdev(conf)) {
+                       if (unshare(CLONE_NEWNS) < 0) {
+                               ERROR("Error unsharing mounts");
+                               goto out_fini_nonet;
+                       }
+                       remount_all_slave();
+                       if (do_rootfs_setup(conf, name, lxcpath) < 0) {
+                               ERROR("Error setting up rootfs mount as root before spawn");
+                               goto out_fini_nonet;
+                       }
+                       INFO("Set up container rootfs as host root");
+               }
+       }
+
         err = lxc_spawn(handler);
         if (err) {
                 ERROR("failed to spawn '%s'", name);
-               goto out_fini_nonet;
+               goto out_detach_blockdev;
         }
  
+       handler->conf->reboot = 0;
+
+       netnsfd = get_netns_fd(handler->pid);
+
         err = lxc_poll(name, handler);
         if (err) {
                 ERROR("mainloop exited with an error");
+               if (netnsfd >= 0)
+                       close(netnsfd);
                 goto out_abort;
         }
  
@@ -978,25 +1223,37 @@ int __lxc_start(const char *name, struct lxc_conf *conf,
                         DEBUG("Container rebooting");
                         handler->conf->reboot = 1;
                         break;
+               case SIGSYS: /* seccomp */
+                       DEBUG("Container violated its seccomp policy");
+                       break;
                 default:
-                       DEBUG("unknown exit status for init: %d\n", WTERMSIG(status));
+                       DEBUG("unknown exit status for init: %d", WTERMSIG(status));
                         break;
                 }
          }
  
-       lxc_rename_phys_nics_on_shutdown(handler->conf);
+       DEBUG("Pushing physical nics back to host namespace");
+       lxc_rename_phys_nics_on_shutdown(netnsfd, handler->conf);
+
+       DEBUG("Tearing down virtual network devices used by container");
+       lxc_delete_network(handler);
+
+       if (netnsfd >= 0)
+               close(netnsfd);
  
         if (handler->pinfd >= 0) {
-               process_lock();
                 close(handler->pinfd);
-               process_unlock();
                 handler->pinfd = -1;
         }
  
+       lxc_monitor_send_exit_code(name, status, handler->lxcpath);
         err =  lxc_error_set_and_log(handler->pid, status);
  out_fini:
         lxc_delete_network(handler);
  
+out_detach_blockdev:
+       detach_block_device(handler->conf);
+
  out_fini_nonet:
         lxc_fini(name, handler);
         return err;
@@ -1035,15 +1292,75 @@ static struct lxc_operations start_ops = {
  };
  
  int lxc_start(const char *name, char *const argv[], struct lxc_conf *conf,
-             const char *lxcpath)
+             const char *lxcpath, bool backgrounded)
  {
         struct start_args start_arg = {
                 .argv = argv,
         };
  
-       if (lxc_check_inherited(conf, -1))
-               return -1;
-
         conf->need_utmp_watch = 1;
-       return __lxc_start(name, conf, &start_ops, &start_arg, lxcpath);
+       return __lxc_start(name, conf, &start_ops, &start_arg, lxcpath, backgrounded);
  }
+
+/* Clean up container @name on disk: destroy its rootfs when the config has
+ * both a rootfs path and a mount point, then delete its directory under
+ * handler->lxcpath.  Any failure is logged and ends the cleanup early. */
+static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
+                                           const char *name)
+{
+       struct lxc_conf *conf = handler->conf;
+       struct lxc_container *c;
+       char destroy[MAXPATHLEN];
+       int ret;
+
+       if (conf && conf->rootfs.path && conf->rootfs.mount &&
+           !do_destroy_container(conf)) {
+               ERROR("Error destroying rootfs for %s", name);
+               return;
+       }
+       INFO("Destroyed rootfs for %s", name);
+
+       ret = snprintf(destroy, MAXPATHLEN, "%s/%s", handler->lxcpath, name);
+       if (ret < 0 || ret >= MAXPATHLEN) {
+               ERROR("Error printing path for %s", name);
+               ERROR("Error destroying directory for %s", name);
+               return;
+       }
+
+       /* mod_all_rdeps() only runs with the container's disk lock held. */
+       c = lxc_container_new(name, handler->lxcpath);
+       if (c) {
+               if (container_disk_lock(c) == 0) {
+                       mod_all_rdeps(c, false);
+                       container_disk_unlock(c);
+               } else {
+                       INFO("Could not update lxc_snapshots file");
+               }
+               lxc_container_put(c);
+       }
+
+       /* An unprivileged caller removes the tree through userns_exec_1(). */
+       ret = am_unpriv() ? userns_exec_1(conf, lxc_rmdir_onedev_wrapper, destroy)
+                         : lxc_rmdir_onedev(destroy, NULL);
+       if (ret < 0) {
+               ERROR("Error destroying directory for %s", name);
+               return;
+       }
+       INFO("Destroyed directory for %s", name);
+}
+
+/* userns_exec_1() callback: data is the path handed to lxc_rmdir_onedev(). */
+static int lxc_rmdir_onedev_wrapper(void *data)
+{
+       return lxc_rmdir_onedev((char *) data, NULL);
+}
+
+/* Privileged: return bdev_destroy(conf).  Unprivileged: run
+ * bdev_destroy_wrapper through userns_exec_1(); false iff that is < 0. */
+static bool do_destroy_container(struct lxc_conf *conf)
+{
+       if (!am_unpriv())
+               return bdev_destroy(conf);
+       return userns_exec_1(conf, bdev_destroy_wrapper, conf) >= 0;
+}
+