]> git.proxmox.com Git - mirror_lxc.git/blobdiff - src/lxc/conf.c
refactor AppArmor into LSM backend, add SELinux support
[mirror_lxc.git] / src / lxc / conf.c
index 2e6a7921331eea8bcf996524967a4b69ef079249..18a92c9d2e2f8845416cf066aba6e38615861da1 100644 (file)
@@ -4,7 +4,7 @@
  * (C) Copyright IBM Corp. 2007, 2008
  *
  * Authors:
- * Daniel Lezcano <dlezcano at fr.ibm.com>
+ * Daniel Lezcano <daniel.lezcano at free.fr>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -18,7 +18,7 @@
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #define _GNU_SOURCE
 #include <stdio.h>
 #include <errno.h>
 #include <string.h>
 #include <dirent.h>
-#include <mntent.h>
 #include <unistd.h>
 #include <sys/wait.h>
+#include <sys/syscall.h>
+#include <time.h>
+
+#if HAVE_IFADDRS_H
+#include <ifaddrs.h>
+#else
+#include <../include/ifaddrs.h>
+#endif
+
+#if HAVE_PTY_H
 #include <pty.h>
+#else
+#include <../include/openpty.h>
+#endif
 
 #include <linux/loop.h>
 
@@ -43,8 +55,6 @@
 #include <sys/mount.h>
 #include <sys/mman.h>
 #include <sys/prctl.h>
-#include <sys/capability.h>
-#include <sys/personality.h>
 
 #include <arpa/inet.h>
 #include <fcntl.h>
 #include "log.h"
 #include "lxc.h"       /* for lxc_cgroup_set() */
 #include "caps.h"       /* for lxc_caps_last_cap() */
+#include "bdev.h"
+#include "cgroup.h"
+#include "lxclock.h"
+#include "lsm/lsm.h"
 
-#if HAVE_APPARMOR
-#include <apparmor.h>
+#if HAVE_SYS_CAPABILITY_H
+#include <sys/capability.h>
 #endif
 
-lxc_log_define(lxc_conf, lxc);
-
-#define MAXHWLEN    18
-#define MAXINDEXLEN 20
-#define MAXMTULEN   16
-#define MAXLINELEN  128
-
-#ifndef MS_DIRSYNC
-#define MS_DIRSYNC  128
+#if HAVE_SYS_PERSONALITY_H
+#include <sys/personality.h>
 #endif
 
-#ifndef MS_REC
-#define MS_REC 16384
+#if IS_BIONIC
+#include <../include/lxcmntent.h>
+#else
+#include <mntent.h>
 #endif
 
-#ifndef MNT_DETACH
-#define MNT_DETACH 2
-#endif
+#include "lxcseccomp.h"
 
-#ifndef MS_RELATIME
-#define MS_RELATIME (1 << 21)
-#endif
+lxc_log_define(lxc_conf, lxc);
 
-#ifndef MS_STRICTATIME
-#define MS_STRICTATIME (1 << 24)
-#endif
+#define MAXHWLEN    18
+#define MAXINDEXLEN 20
+#define MAXMTULEN   16
+#define MAXLINELEN  128
 
+#if HAVE_SYS_CAPABILITY_H
 #ifndef CAP_SETFCAP
 #define CAP_SETFCAP 31
 #endif
@@ -104,15 +111,51 @@ lxc_log_define(lxc_conf, lxc);
 #ifndef CAP_MAC_ADMIN
 #define CAP_MAC_ADMIN 33
 #endif
+#endif
 
 #ifndef PR_CAPBSET_DROP
 #define PR_CAPBSET_DROP 24
 #endif
 
-char *lxchook_names[NUM_LXC_HOOKS] = {
-       "pre-start", "pre-mount", "mount", "start", "post-stop" };
+#ifndef LO_FLAGS_AUTOCLEAR
+#define LO_FLAGS_AUTOCLEAR 4
+#endif
 
+/* Define pivot_root() if missing from the C library */
+#ifndef HAVE_PIVOT_ROOT
+/* Thin wrapper around the raw pivot_root syscall; fails with ENOSYS when
+ * the build headers provide no syscall number for it. */
+static int pivot_root(const char * new_root, const char * put_old)
+{
+#ifndef __NR_pivot_root
+       errno = ENOSYS;
+       return -1;
+#else
+       return syscall(__NR_pivot_root, new_root, put_old);
+#endif
+}
+#else
 extern int pivot_root(const char * new_root, const char * put_old);
+#endif
+
+/* Define sethostname() if missing from the C library */
+#ifndef HAVE_SETHOSTNAME
+/* Thin wrapper around the raw sethostname syscall; fails with ENOSYS when
+ * the build headers provide no syscall number for it. */
+static int sethostname(const char * name, size_t len)
+{
+#ifndef __NR_sethostname
+       errno = ENOSYS;
+       return -1;
+#else
+       return syscall(__NR_sethostname, name, len);
+#endif
+}
+#endif
+
+/* Define __S_ISTYPE if missing from the C library */
+#ifndef __S_ISTYPE
+#define        __S_ISTYPE(mode, mask)  (((mode) & S_IFMT) == (mask))
+#endif
+
+char *lxchook_names[NUM_LXC_HOOKS] = {
+       "pre-start", "pre-mount", "mount", "autodev", "start", "post-stop", "clone" };
 
 typedef int (*instanciate_cb)(struct lxc_handler *, struct lxc_netdev *);
 
@@ -184,6 +227,7 @@ static struct mount_opt mount_opt[] = {
        { NULL,            0, 0              },
 };
 
+#if HAVE_SYS_CAPABILITY_H
 static struct caps_opt caps_opt[] = {
        { "chown",             CAP_CHOWN             },
        { "dac_override",      CAP_DAC_OVERRIDE      },
@@ -230,13 +274,179 @@ static struct caps_opt caps_opt[] = {
        { "wake_alarm",        CAP_WAKE_ALARM        },
 #endif
 };
+#else
+static struct caps_opt caps_opt[] = {};
+#endif
+
+static char padchar[] =
+"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+/*
+ * mkifname: build an unused network interface name from a template.
+ *
+ * Every 'X' in @template is replaced by a random character taken from
+ * padchar, and the result is compared against the interface names
+ * reported by getifaddrs().  Generation is retried until a name is found
+ * that does not match any reported interface.
+ *
+ * Returns a newly allocated string that the caller must free(), or NULL
+ * if memory allocation fails, or if @template contains no 'X' and already
+ * names an existing interface (retrying could never succeed).
+ */
+static char *mkifname(char *template)
+{
+       char *name = NULL;
+       size_t i, namelen;
+       /* number of usable pad characters, excluding the trailing NUL */
+       const size_t npad = sizeof(padchar) - 1;
+       FILE *urandom;
+       unsigned int seed;
+       struct ifaddrs *ifaddr = NULL, *ifa;
+       int ifexists = 0;
+       int replaced;
+
+       /* Get all the network interfaces.  If that fails, carry on with an
+        * empty list rather than walking an uninitialized pointer. */
+       if (getifaddrs(&ifaddr) < 0) {
+               WARN("failed to list network interfaces: %s", strerror(errno));
+               ifaddr = NULL;
+       }
+
+       /* Initialize the random number generator */
+       process_lock();
+       urandom = fopen ("/dev/urandom", "r");
+       process_unlock();
+       if (urandom != NULL) {
+               /* fread() returns the number of items read, here 0 or 1 */
+               if (fread(&seed, sizeof(seed), 1, urandom) != 1)
+                       seed = time(0);
+               process_lock();
+               fclose(urandom);
+               process_unlock();
+       } else {
+               seed = time(0);
+       }
+
+#ifndef HAVE_RAND_R
+       srand(seed);
+#endif
+
+       /* Generate random names until we find one that doesn't exist */
+       while (1) {
+               ifexists = 0;
+               replaced = 0;
+               name = strdup(template);
+               if (name == NULL)
+                       break;
+
+               namelen = strlen(name);
+               for (i = 0; i < namelen; i++) {
+                       if (name[i] != 'X')
+                               continue;
+#ifdef HAVE_RAND_R
+                       name[i] = padchar[rand_r(&seed) % npad];
+#else
+                       name[i] = padchar[rand() % npad];
+#endif
+                       replaced = 1;
+               }
+
+               for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) {
+                       if (strcmp(ifa->ifa_name, name) == 0) {
+                               ifexists = 1;
+                               break;
+                       }
+               }
+
+               if (ifexists == 0)
+                       break;
+
+               free(name);
+               name = NULL;
+
+               /* A template without any 'X' always yields the same name,
+                * so another round could not produce a free one. */
+               if (!replaced)
+                       break;
+       }
+
+       /* released on every path, including allocation failure */
+       if (ifaddr)
+               freeifaddrs(ifaddr);
+       return name;
+}
+
+/*
+ * run_buffer: run the command line in @buffer through popen(), forward
+ * each chunk of its output to the debug log and evaluate its exit status.
+ *
+ * Returns -1 if popen(), the output buffer allocation or pclose() fails,
+ * or if the command exits with a non-zero status or is terminated by a
+ * signal; returns 0 otherwise.
+ */
+static int run_buffer(char *buffer)
+{
+       FILE *proc;
+       char *line;
+       int status;
+
+       process_lock();
+       proc = popen(buffer, "r");
+       process_unlock();
+       if (proc == NULL) {
+               SYSERROR("popen failed");
+               return -1;
+       }
+
+       line = malloc(LXC_LOG_BUFFER_SIZE);
+       if (line == NULL) {
+               ERROR("failed to allocate memory for script output");
+               process_lock();
+               pclose(proc);
+               process_unlock();
+               return -1;
+       }
+
+       for (;;) {
+               if (fgets(line, LXC_LOG_BUFFER_SIZE, proc) == NULL)
+                       break;
+               DEBUG("script output: %s", line);
+       }
+       free(line);
+
+       process_lock();
+       status = pclose(proc);
+       process_unlock();
+
+       if (status == -1) {
+               SYSERROR("Script exited on error");
+               return -1;
+       }
+       if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
+               ERROR("Script exited with status %d", WEXITSTATUS(status));
+               return -1;
+       }
+       if (WIFSIGNALED(status)) {
+               ERROR("Script terminated by signal %d (%s)", WTERMSIG(status),
+                     strsignal(WTERMSIG(status)));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * run_script_argv: run a hook script with a NULL-terminated argument list.
+ *
+ * Builds the command line "<script> <name> <section> <hook> [args...]" in
+ * a stack buffer and hands it to run_buffer().
+ *
+ * @name:    container name, passed as first argument to the script
+ * @section: config section, passed as second argument
+ * @script:  command to execute
+ * @hook:    hook name, passed as third argument
+ * @lxcpath: accepted but not used when building the command line
+ * @argsin:  optional NULL-terminated array of extra arguments (may be NULL)
+ *
+ * Returns the result of run_buffer(), or -1 if the command line cannot be
+ * built.
+ */
+static int run_script_argv(const char *name, const char *section,
+                     const char *script, const char *hook, const char *lxcpath,
+                     char **argsin)
+{
+       int ret, i;
+       char *buffer;
+       size_t size = 0;
+
+       INFO("Executing script '%s' for container '%s', config section '%s'",
+            script, name, section);
+
+       /* each extra argument is preceded by one space */
+       for (i = 0; argsin && argsin[i]; i++)
+               size += strlen(argsin[i]) + 1;
+
+       size += strlen(hook) + 1;
+
+       size += strlen(script);
+       size += strlen(name);
+       size += strlen(section);
+       /* two separating spaces plus the terminating NUL */
+       size += 3;
+
+       if (size > INT_MAX)
+               return -1;
+
+       /* alloca() does not report failure through its return value, so
+        * there is no NULL check to make here */
+       buffer = alloca(size);
+
+       ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
+       if (ret < 0 || (size_t)ret >= size) {
+               ERROR("Script name too long");
+               return -1;
+       }
+
+       for (i = 0; argsin && argsin[i]; i++) {
+               size_t len = size - (size_t)ret;
+               int rc;
+
+               rc = snprintf(buffer + ret, len, " %s", argsin[i]);
+               if (rc < 0 || (size_t)rc >= len) {
+                       ERROR("Script args too long");
+                       return -1;
+               }
+               ret += rc;
+       }
+
+       return run_buffer(buffer);
+}
 
 static int run_script(const char *name, const char *section,
                      const char *script, ...)
 {
        int ret;
-       FILE *f;
-       char *buffer, *p, *output;
+       char *buffer, *p;
        size_t size = 0;
        va_list ap;
 
@@ -265,7 +475,6 @@ static int run_script(const char *name, const char *section,
        ret = snprintf(buffer, size, "%s %s %s", script, name, section);
        if (ret < 0 || ret >= size) {
                ERROR("Script name too long");
-               free(buffer);
                return -1;
        }
 
@@ -275,7 +484,6 @@ static int run_script(const char *name, const char *section,
                int rc;
                rc = snprintf(buffer + ret, len, " %s", p);
                if (rc < 0 || rc >= len) {
-                       free(buffer);
                        ERROR("Script args too long");
                        return -1;
                }
@@ -283,29 +491,7 @@ static int run_script(const char *name, const char *section,
        }
        va_end(ap);
 
-       f = popen(buffer, "r");
-       if (!f) {
-               SYSERROR("popen failed");
-               return -1;
-       }
-
-       output = malloc(LXC_LOG_BUFFER_SIZE);
-       if (!output) {
-               ERROR("failed to allocate memory for script output");
-               return -1;
-       }
-
-       while(fgets(output, LXC_LOG_BUFFER_SIZE, f))
-               DEBUG("script output: %s", output);
-
-       free(output);
-
-       if (pclose(f) == -1) {
-               SYSERROR("Script exited on error");
-               return -1;
-       }
-
-       return 0;
+       return run_buffer(buffer);
 }
 
 static int find_fstype_cb(char* buffer, void *data)
@@ -395,7 +581,9 @@ static int setup_lodev(const char *rootfs, int fd, struct loop_info64 *loinfo)
        int rfd;
        int ret = -1;
 
+       process_lock();
        rfd = open(rootfs, O_RDWR);
+       process_unlock();
        if (rfd < 0) {
                SYSERROR("failed to open '%s'", rootfs);
                return -1;
@@ -417,7 +605,9 @@ static int setup_lodev(const char *rootfs, int fd, struct loop_info64 *loinfo)
 
        ret = 0;
 out:
+       process_lock();
        close(rfd);
+       process_unlock();
 
        return ret;
 }
@@ -430,7 +620,9 @@ static int mount_rootfs_file(const char *rootfs, const char *target)
        DIR *dir;
        char path[MAXPATHLEN];
 
+       process_lock();
        dir = opendir("/dev");
+       process_unlock();
        if (!dir) {
                SYSERROR("failed to open '/dev'");
                return -1;
@@ -454,18 +646,25 @@ static int mount_rootfs_file(const char *rootfs, const char *target)
                if (rc < 0 || rc >= MAXPATHLEN)
                        continue;
 
+               process_lock();
                fd = open(path, O_RDWR);
+               process_unlock();
                if (fd < 0)
                        continue;
 
                if (ioctl(fd, LOOP_GET_STATUS64, &loinfo) == 0) {
+                       process_lock();
                        close(fd);
+                       process_unlock();
                        continue;
                }
 
                if (errno != ENXIO) {
                        WARN("unexpected error for ioctl on '%s': %m",
                             direntp->d_name);
+                       process_lock();
+                       close(fd);
+                       process_unlock();
                        continue;
                }
 
@@ -474,13 +673,17 @@ static int mount_rootfs_file(const char *rootfs, const char *target)
                ret = setup_lodev(rootfs, fd, &loinfo);
                if (!ret)
                        ret = mount_unknow_fs(path, target, 0);
+               process_lock();
                close(fd);
+               process_unlock();
 
                break;
        }
 
+       process_lock();
        if (closedir(dir))
                WARN("failed to close directory");
+       process_unlock();
 
        return ret;
 }
@@ -492,9 +695,10 @@ static int mount_rootfs_block(const char *rootfs, const char *target)
 
 /*
  * pin_rootfs
- * if rootfs is a directory, then open ${rootfs}.hold for writing for the
- * duration of the container run, to prevent the container from marking the
- * underlying fs readonly on shutdown.
+ * if rootfs is a directory, then open ${rootfs}/lxc.hold for writing for
+ * the duration of the container run, to prevent the container from marking
+ * the underlying fs readonly on shutdown. unlink the file immediately so
+ * no name pollution happens
  * return -1 on error.
  * return -2 if nothing needed to be pinned.
  * return an open fd (>=0) if we pinned it.
@@ -507,37 +711,116 @@ int pin_rootfs(const char *rootfs)
        int ret, fd;
 
        if (rootfs == NULL || strlen(rootfs) == 0)
-               return 0;
+               return -2;
 
-       if (!realpath(rootfs, absrootfs)) {
-               SYSERROR("failed to get real path for '%s'", rootfs);
-               return -1;
-       }
+       if (!realpath(rootfs, absrootfs))
+               return -2;
 
-       if (access(absrootfs, F_OK)) {
-               SYSERROR("'%s' is not accessible", absrootfs);
+       if (access(absrootfs, F_OK))
                return -1;
-       }
 
-       if (stat(absrootfs, &s)) {
-               SYSERROR("failed to stat '%s'", absrootfs);
+       if (stat(absrootfs, &s))
                return -1;
-       }
 
-       if (!__S_ISTYPE(s.st_mode, S_IFDIR))
+       if (!S_ISDIR(s.st_mode))
                return -2;
 
-       ret = snprintf(absrootfspin, MAXPATHLEN, "%s%s", absrootfs, ".hold");
-       if (ret >= MAXPATHLEN) {
-               SYSERROR("pathname too long for rootfs hold file");
+       ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
+       if (ret >= MAXPATHLEN)
                return -1;
-       }
 
+       process_lock();
        fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
-       INFO("opened %s as fd %d\n", absrootfspin, fd);
+       process_unlock();
+       if (fd < 0)
+               return fd;
+       (void)unlink(absrootfspin);
        return fd;
 }
 
+/*
+ * lxc_mount_auto_mounts: perform the automatic mounts selected by @flags
+ * below conf->rootfs.mount.
+ *
+ *   LXC_AUTO_PROC        mount proc on /proc (nodev, noexec, nosuid)
+ *   LXC_AUTO_PROC_SYSRQ  bind-mount <rootfs>/dev/null over
+ *                        /proc/sysrq-trigger; a failing mount is only
+ *                        logged as a warning
+ *   LXC_AUTO_SYS         mount sysfs read-only on /sys
+ *   LXC_AUTO_CGROUP      call lxc_setup_mount_cgroup() with @cgroup_info
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct cgroup_process_info *cgroup_info)
+{
+       char *path = NULL;
+       char *dev_null = NULL;
+       int r;
+       int ret = -1;
+
+       dev_null = lxc_append_paths(conf->rootfs.mount, "/dev/null");
+       if (!dev_null) {
+               SYSERROR("memory allocation error");
+               goto cleanup;
+       }
+
+       if (flags & LXC_AUTO_PROC) {
+               path = lxc_append_paths(conf->rootfs.mount, "/proc");
+               if (!path) {
+                       SYSERROR("memory allocation error trying to automatically mount /proc");
+                       goto cleanup;
+               }
+
+               r = mount("proc", path, "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
+               if (r < 0) {
+                       SYSERROR("error mounting /proc");
+                       goto cleanup;
+               }
+
+               free(path);
+               path = NULL;
+       }
+
+       if (flags & LXC_AUTO_PROC_SYSRQ) {
+               path = lxc_append_paths(conf->rootfs.mount, "/proc/sysrq-trigger");
+               if (!path) {
+                       SYSERROR("memory allocation error trying to automatically mount /proc/sysrq-trigger");
+                       goto cleanup;
+               }
+
+               /* safety measure, mount /dev/null over /proc/sysrq-trigger,
+                * otherwise, a container may trigger a host reboot or such
+                */
+               r = mount(dev_null, path, NULL, MS_BIND, NULL);
+               if (r < 0)
+                       WARN("error mounting /dev/null over /proc/sysrq-trigger: %s", strerror(errno));
+
+               free(path);
+               path = NULL;
+       }
+
+       if (flags & LXC_AUTO_SYS) {
+               path = lxc_append_paths(conf->rootfs.mount, "/sys");
+               if (!path) {
+                       SYSERROR("memory allocation error trying to automatically mount /sys");
+                       goto cleanup;
+               }
+
+               r = mount("sysfs", path, "sysfs", MS_RDONLY, NULL);
+               if (r < 0) {
+                       SYSERROR("error mounting /sys");
+                       goto cleanup;
+               }
+
+               free(path);
+               path = NULL;
+       }
+
+       if (flags & LXC_AUTO_CGROUP) {
+               r = lxc_setup_mount_cgroup(conf->rootfs.mount, cgroup_info);
+               if (r < 0) {
+                       SYSERROR("error mounting /sys/fs/cgroup");
+                       goto cleanup;
+               }
+       }
+
+       ret = 0;
+
+cleanup:
+       /* shared by the success and error paths; free(NULL) is a no-op */
+       free(dev_null);
+       free(path);
+       return ret;
+}
+
 static int mount_rootfs(const char *rootfs, const char *target)
 {
        char absrootfs[MAXPATHLEN];
@@ -624,12 +907,17 @@ static int setup_tty(const struct lxc_rootfs *rootfs,
                                ERROR("pathname too long for ttys");
                                return -1;
                        }
+                       process_lock();
                        ret = creat(lxcpath, 0660);
+                       process_unlock();
                        if (ret==-1 && errno != EEXIST) {
                                SYSERROR("error creating %s\n", lxcpath);
                                return -1;
                        }
-                       close(ret);
+                       process_lock();
+                       if (ret >= 0)
+                               close(ret);
+                       process_unlock();
                        ret = unlink(path);
                        if (ret && errno != ENOENT) {
                                SYSERROR("error unlinking %s\n", path);
@@ -655,12 +943,17 @@ static int setup_tty(const struct lxc_rootfs *rootfs,
                } else {
                        /* If we populated /dev, then we need to create /dev/ttyN */
                        if (access(path, F_OK)) {
+                               process_lock();
                                ret = creat(path, 0660);
+                               process_unlock();
                                if (ret==-1) {
                                        SYSERROR("error creating %s\n", path);
                                        /* this isn't fatal, continue */
-                               } else
+                               } else {
+                                       process_lock();
                                        close(ret);
+                                       process_unlock();
+                               }
                        }
                        if (mount(pty_info->name, path, "none", MS_BIND, 0)) {
                                WARN("failed to mount '%s'->'%s'",
@@ -678,7 +971,7 @@ static int setup_tty(const struct lxc_rootfs *rootfs,
 static int setup_rootfs_pivot_root_cb(char *buffer, void *data)
 {
        struct lxc_list *mountlist, *listentry, *iterator;
-       char *pivotdir, *mountpoint, *mountentry;
+       char *pivotdir, *mountpoint, *mountentry, *saveptr = NULL;
        int found;
        void **cbparm;
 
@@ -689,12 +982,12 @@ static int setup_rootfs_pivot_root_cb(char *buffer, void *data)
        pivotdir  = cbparm[1];
 
        /* parse entry, first field is mountname, ignore */
-       mountpoint = strtok(mountentry, " ");
+       mountpoint = strtok_r(mountentry, " ", &saveptr);
        if (!mountpoint)
                return -1;
 
        /* second field is mountpoint */
-       mountpoint = strtok(NULL, " ");
+       mountpoint = strtok_r(NULL, " ", &saveptr);
        if (!mountpoint)
                return -1;
 
@@ -723,6 +1016,7 @@ static int setup_rootfs_pivot_root_cb(char *buffer, void *data)
        listentry->elem = strdup(mountpoint);
        if (!listentry->elem) {
                SYSERROR("strdup failed");
+               free(listentry);
                return -1;
        }
        lxc_list_add_tail(mountlist, listentry);
@@ -869,6 +1163,37 @@ static int setup_rootfs_pivot_root(const char *rootfs, const char *pivotdir)
        return 0;
 }
 
+/*
+ * mount_autodev: mount a small tmpfs on <root>/dev and create an empty
+ * <root>/dev/pts directory inside it.
+ *
+ * Do we want to add options for max size of /dev and a file to
+ * specify which devices to create?
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int mount_autodev(char *root)
+{
+       int ret;
+       char path[MAXPATHLEN];
+
+       INFO("Mounting /dev under %s\n", root);
+       ret = snprintf(path, MAXPATHLEN, "%s/dev", root);
+       /* ret == MAXPATHLEN already means the path was truncated */
+       if (ret < 0 || ret >= MAXPATHLEN)
+               return -1;
+       ret = mount("none", path, "tmpfs", 0, "size=100000");
+       if (ret) {
+               SYSERROR("Failed to mount /dev at %s\n", root);
+               return -1;
+       }
+       ret = snprintf(path, MAXPATHLEN, "%s/dev/pts", root);
+       if (ret < 0 || ret >= MAXPATHLEN)
+               return -1;
+       ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+       if (ret) {
+               SYSERROR("Failed to create /dev/pts in container");
+               return -1;
+       }
+
+       INFO("Mounted /dev under %s\n", root);
+       return 0;
+}
+
 struct lxc_devs {
        char *name;
        mode_t mode;
@@ -886,54 +1211,162 @@ struct lxc_devs lxc_devs[] = {
        { "console",    S_IFCHR | S_IRUSR | S_IWUSR,           5, 1     },
 };
 
-/*
- * Do we want to add options for max size of /dev and a file to
- * specify which devices to create?
- */
 static int setup_autodev(char *root)
 {
        int ret;
        struct lxc_devs *d;
        char path[MAXPATHLEN];
        int i;
+       mode_t cmask;
+
+       INFO("Creating initial consoles under %s/dev\n", root);
 
-       INFO("Creating and populating /dev under %s\n", root);
        ret = snprintf(path, MAXPATHLEN, "%s/dev", root);
-       if (ret < 0 || ret > MAXPATHLEN)
-               return -1;
-       ret = mount("none", path, "tmpfs", 0, "size=100000");
-       if (ret) {
-               SYSERROR("Failed to mount /dev at %s\n", root);
+       if (ret < 0 || ret >= MAXPATHLEN) {
+               ERROR("Error calculating container /dev location");
                return -1;
        }
+
+       INFO("Populating /dev under %s\n", root);
+       cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
        for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
                d = &lxc_devs[i];
                ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", root, d->name);
                if (ret < 0 || ret >= MAXPATHLEN)
                        return -1;
                ret = mknod(path, d->mode, makedev(d->maj, d->min));
-               if (ret) {
+               if (ret && errno != EEXIST) {
                        SYSERROR("Error creating %s\n", d->name);
                        return -1;
                }
        }
-       ret = snprintf(path, MAXPATHLEN, "%s/dev/pts", root);
-       if (ret < 0 || ret >= MAXPATHLEN)
+       umask(cmask);
+
+       INFO("Populated /dev under %s\n", root);
+       return 0;
+}
+
+/*
+ * Detect whether / is mounted MS_SHARED.  The only way I know of to
+ * check that is through /proc/self/mountinfo.
+ * I'm only checking for /.  If the container rootfs or mount location
+ * is MS_SHARED, but not '/', then you're out of luck - figuring that
+ * out would be too much work to be worth it.
+ */
+#define LINELEN 4096
+/*
+ * Returns 1 if the /proc/self/mountinfo entry whose mount point is "/"
+ * carries a "shared:" tag after its options field, 0 otherwise (including
+ * when the file cannot be opened).
+ */
+int detect_shared_rootfs(void)
+{
+       char line[LINELEN];
+       char *cur, *end;
+       FILE *mountinfo;
+       int field;
+       int shared = 0;
+
+       process_lock();
+       mountinfo = fopen("/proc/self/mountinfo", "r");
+       process_unlock();
+       if (!mountinfo)
+               return 0;
+
+       while (!shared && (cur = fgets(line, LINELEN, mountinfo))) {
+               INFO("looking at .%s.", cur);
+
+               /* step over four separators; cur ends up on the space in
+                * front of the mount point field */
+               field = 0;
+               while (cur && field < 4) {
+                       cur = index(cur + 1, ' ');
+                       field++;
+               }
+               if (!cur)
+                       continue;
+
+               /* terminate the mount point field in place */
+               end = index(cur + 1, ' ');
+               if (!end)
+                       continue;
+               *end = '\0';
+               INFO("now p is .%s.", cur);
+
+               if (strcmp(cur + 1, "/") != 0)
+                       continue;
+
+               /* this is '/'.  is it shared? */
+               cur = index(end + 1, ' ');
+               if (cur && strstr(cur, "shared:"))
+                       shared = 1;
+       }
+
+       process_lock();
+       fclose(mountinfo);
+       process_unlock();
+       return shared;
+}
+
+/*
+ * I'll forgive you for asking whether all of this is needed :)  The
+ * answer is yes.
+ * pivot_root will fail if the new root, the put_old dir, or the parent
+ * of current->fs->root are MS_SHARED.  (parent of current->fs->root may
+ * or may not be current->fs->root - if we assumed it always was, we could
+ * just mount --make-rslave /).  So,
+ *    1. mount a tiny tmpfs to be parent of current->fs->root.
+ *    2. make that MS_SLAVE
+ *    3. make a 'root' directory under that
+ *    4. mount --rbind / under the $tinyroot/root.
+ *    5. make that rslave
+ *    6. chdir and chroot into $tinyroot/root
+ *    7. $tinyroot will be unmounted by our parent in start.c
+ */
+static int chroot_into_slave(struct lxc_conf *conf)
+{
+       char path[MAXPATHLEN];
+       const char *destpath = conf->rootfs.mount;
+       int ret;
+
+       if (mount(destpath, destpath, NULL, MS_BIND, 0)) {
+               SYSERROR("failed to mount %s bind", destpath);
                return -1;
-       ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
-       if (ret) {
+       }
+       if (mount("", destpath, NULL, MS_SLAVE, 0)) {
+               SYSERROR("failed to make %s slave", destpath);
+               return -1;
+       }
+       if (mount("none", destpath, "tmpfs", 0, "size=10000")) {
+               SYSERROR("Failed to mount tmpfs / at %s", destpath);
+               return -1;
+       }
+       ret = snprintf(path, MAXPATHLEN, "%s/root", destpath);
+       if (ret < 0 || ret >= MAXPATHLEN) {
+               ERROR("out of memory making root path");
+               return -1;
+       }
+       if (mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
                SYSERROR("Failed to create /dev/pts in container");
                return -1;
        }
-
-       INFO("Populated /dev under %s\n", root);
+       if (mount("/", path, NULL, MS_BIND|MS_REC, 0)) {
+               SYSERROR("Failed to rbind mount / to %s", path);
+               return -1;
+       }
+       if (mount("", destpath, NULL, MS_SLAVE|MS_REC, 0)) {
+               SYSERROR("Failed to make tmp-/ at %s rslave", path);
+               return -1;
+       }
+       if (chdir(path)) {
+               SYSERROR("Failed to chdir into tmp-/");
+               return -1;
+       }
+       if (chroot(path)) {
+               SYSERROR("Failed to chroot into tmp-/");
+               return -1;
+       }
+       INFO("Chrooted into tmp-/ at %s\n", path);
        return 0;
 }
 
-static int setup_rootfs(const struct lxc_rootfs *rootfs)
+static int setup_rootfs(struct lxc_conf *conf)
 {
-       if (!rootfs->path)
+       const struct lxc_rootfs *rootfs = &conf->rootfs;
+
+       if (!rootfs->path) {
+               if (mount("", "/", NULL, MS_SLAVE|MS_REC, 0)) {
+                       SYSERROR("Failed to make / rslave");
+                       return -1;
+               }
                return 0;
+       }
 
        if (access(rootfs->mount, F_OK)) {
                SYSERROR("failed to access to '%s', check it is present",
@@ -941,6 +1374,22 @@ static int setup_rootfs(const struct lxc_rootfs *rootfs)
                return -1;
        }
 
+       if (detect_shared_rootfs()) {
+               if (chroot_into_slave(conf)) {
+                       ERROR("Failed to chroot into slave /");
+                       return -1;
+               }
+       }
+
+       // First try mounting rootfs using a bdev
+       struct bdev *bdev = bdev_init(rootfs->path, rootfs->mount, NULL);
+       if (bdev && bdev->ops->mount(bdev) == 0) {
+               bdev_put(bdev);
+               DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount);
+               return 0;
+       }
+       if (bdev)
+               bdev_put(bdev);
        if (mount_rootfs(rootfs->path, rootfs->mount)) {
                ERROR("failed to mount rootfs");
                return -1;
@@ -1006,6 +1455,7 @@ out:
 
 static int setup_personality(int persona)
 {
+       #if HAVE_SYS_PERSONALITY_H
        if (persona == -1)
                return 0;
 
@@ -1015,6 +1465,7 @@ static int setup_personality(int persona)
        }
 
        INFO("set personality to '0x%x'", persona);
+       #endif
 
        return 0;
 }
@@ -1037,8 +1488,8 @@ static int setup_dev_console(const struct lxc_rootfs *rootfs,
                return 0;
        }
 
-       if (console->peer == -1) {
-               INFO("no console output required");
+       if (console->master < 0) {
+               INFO("no console");
                return 0;
        }
 
@@ -1095,15 +1546,20 @@ static int setup_ttydir_console(const struct lxc_rootfs *rootfs,
                return -1;
        }
 
+       process_lock();
        ret = creat(lxcpath, 0660);
+       process_unlock();
        if (ret==-1 && errno != EEXIST) {
                SYSERROR("error %d creating %s\n", errno, lxcpath);
                return -1;
        }
-       close(ret);
+       process_lock();
+       if (ret >= 0)
+               close(ret);
+       process_unlock();
 
-       if (console->peer == -1) {
-               INFO("no console output required");
+       if (console->master < 0) {
+               INFO("no console");
                return 0;
        }
 
@@ -1148,6 +1604,8 @@ static int setup_kmsg(const struct lxc_rootfs *rootfs,
        char kpath[MAXPATHLEN];
        int ret;
 
+       if (!rootfs->path)
+               return 0;
        ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount);
        if (ret < 0 || ret >= sizeof(kpath))
                return -1;
@@ -1167,31 +1625,6 @@ static int setup_kmsg(const struct lxc_rootfs *rootfs,
        return 0;
 }
 
-static int setup_cgroup(const char *name, struct lxc_list *cgroups)
-{
-       struct lxc_list *iterator;
-       struct lxc_cgroup *cg;
-       int ret = -1;
-
-       if (lxc_list_empty(cgroups))
-               return 0;
-
-       lxc_list_for_each(iterator, cgroups) {
-
-               cg = iterator->elem;
-
-               if (lxc_cgroup_set(name, cg->subsystem, cg->value))
-                       goto out;
-
-               DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
-       }
-
-       ret = 0;
-       INFO("cgroup has been setup");
-out:
-       return ret;
-}
-
 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
 {
        struct mount_opt *mo;
@@ -1294,6 +1727,9 @@ static inline int mount_entry_on_systemfs(struct mntent *mntent)
        ret = mount_entry(mntent->mnt_fsname, mntent->mnt_dir,
                          mntent->mnt_type, mntflags, mntdata);
 
+       if (hasmntopt(mntent, "optional") != NULL)
+               ret = 0;
+
        free(mntdata);
 
        return ret;
@@ -1308,15 +1744,22 @@ static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
        unsigned long mntflags;
        char *mntdata;
        int r, ret = 0, offset;
+       const char *lxcpath;
 
        if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
                ERROR("failed to parse mount option '%s'", mntent->mnt_opts);
                return -1;
        }
 
+       lxcpath = default_lxc_path();
+       if (!lxcpath) {
+               ERROR("Out of memory");
+               return -1;
+       }
+
        /* if rootfs->path is a blockdev path, allow container fstab to
-        * use /var/lib/lxc/CN/rootfs as the target prefix */
-       r = snprintf(path, MAXPATHLEN, "/var/lib/lxc/%s/rootfs", lxc_name);
+        * use $lxcpath/CN/rootfs as the target prefix */
+       r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
        if (r < 0 || r >= MAXPATHLEN)
                goto skipvarlib;
 
@@ -1348,6 +1791,9 @@ skipabs:
        ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
                          mntflags, mntdata);
 
+       if (hasmntopt(mntent, "optional") != NULL)
+               ret = 0;
+
 out:
        free(mntdata);
        return ret;
@@ -1376,6 +1822,9 @@ static int mount_entry_on_relative_rootfs(struct mntent *mntent,
        ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
                          mntflags, mntdata);
 
+       if (hasmntopt(mntent, "optional") != NULL)
+               ret = 0;
+
        free(mntdata);
 
        return ret;
@@ -1423,7 +1872,9 @@ static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
        if (!fstab)
                return 0;
 
+       process_lock();
        file = setmntent(fstab, "r");
+       process_unlock();
        if (!file) {
                SYSERROR("failed to use '%s'", fstab);
                return -1;
@@ -1431,7 +1882,9 @@ static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
 
        ret = mount_file_entries(rootfs, file, lxc_name);
 
+       process_lock();
        endmntent(file);
+       process_unlock();
        return ret;
 }
 
@@ -1443,7 +1896,9 @@ static int setup_mount_entries(const struct lxc_rootfs *rootfs, struct lxc_list
        char *mount_entry;
        int ret;
 
+       process_lock();
        file = tmpfile();
+       process_unlock();
        if (!file) {
                ERROR("tmpfile error: %m");
                return -1;
@@ -1458,7 +1913,9 @@ static int setup_mount_entries(const struct lxc_rootfs *rootfs, struct lxc_list
 
        ret = mount_file_entries(rootfs, file, lxc_name);
 
+       process_lock();
        fclose(file);
+       process_unlock();
        return ret;
 }
 
@@ -1513,7 +1970,76 @@ static int setup_caps(struct lxc_list *caps)
 
        }
 
-       DEBUG("capabilities has been setup");
+       DEBUG("capabilities have been setup");
+
+       return 0;
+}
+
+/*
+ * Drop every capability from the bounding set except the ones listed in
+ * @caps.  Each list element is a string: either a name found in caps_opt[]
+ * or a decimal capability number.
+ *
+ * Returns 0 on success, -1 if the capability count is implausible, if an
+ * entry is unknown or outside the range reported by lxc_caps_last_cap(),
+ * or if prctl(PR_CAPBSET_DROP) fails.
+ */
+static int dropcaps_except(struct lxc_list *caps)
+{
+       struct lxc_list *iterator;
+       char *keep_entry;
+       char *ptr;
+       int i, capid;
+       long val;
+       int numcaps = lxc_caps_last_cap() + 1;
+       INFO("found %d capabilities\n", numcaps);
+
+       if (numcaps <= 0 || numcaps > 200)
+               return -1;
+
+       // caplist[i] is 1 if we keep capability i
+       int *caplist = alloca(numcaps * sizeof(int));
+       memset(caplist, 0, numcaps * sizeof(int));
+
+       lxc_list_for_each(iterator, caps) {
+
+               keep_entry = iterator->elem;
+
+               capid = -1;
+
+               for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
+
+                       if (strcmp(keep_entry, caps_opt[i].name))
+                               continue;
+
+                       capid = caps_opt[i].value;
+                       break;
+               }
+
+               if (capid < 0) {
+                       /* try to see if it's numeric, so the user may specify
+                        * capabilities that the running kernel knows about but
+                        * we don't.  Parse into a long so range errors are
+                        * visible, and reject empty strings, trailing junk
+                        * and anything outside [0, numcaps) */
+                       errno = 0;
+                       val = strtol(keep_entry, &ptr, 10);
+                       if (ptr != keep_entry && *ptr == '\0' &&
+                           errno != ERANGE && val >= 0 && val < numcaps)
+                               capid = (int)val;
+               }
+
+               /* caplist[] only has numcaps slots, so a capability we know
+                * by name but the running kernel does not must be rejected
+                * too, otherwise the store below writes out of bounds */
+               if (capid < 0 || capid >= numcaps) {
+                       ERROR("unknown capability %s", keep_entry);
+                       return -1;
+               }
+
+               DEBUG("keep capability '%s' (%d)", keep_entry, capid);
+
+               caplist[capid] = 1;
+       }
+       for (i=0; i<numcaps; i++) {
+               if (caplist[i])
+                       continue;
+               if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
+                       SYSERROR("failed to remove capability %d", i);
+                       return -1;
+               }
+       }
+
+       DEBUG("capabilities have been setup");
 
        return 0;
 }
@@ -1534,14 +2060,18 @@ static int setup_hw_addr(char *hwaddr, const char *ifname)
        memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
        memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
 
+       process_lock();
        fd = socket(AF_INET, SOCK_DGRAM, 0);
+       process_unlock();
        if (fd < 0) {
                ERROR("socket failure : %s", strerror(errno));
                return -1;
        }
 
        ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
+       process_lock();
        close(fd);
+       process_unlock();
        if (ret)
                ERROR("ioctl failure : %s", strerror(errno));
 
@@ -1767,26 +2297,47 @@ static int setup_network(struct lxc_list *network)
        return 0;
 }
 
+/*
+ * Rename every NIC recorded in conf->saved_nics back to its saved
+ * orig_name (looked up by ifindex), then release the saved records.
+ */
+void lxc_rename_phys_nics_on_shutdown(struct lxc_conf *conf)
+{
+       int i;
+
+       INFO("running to reset %d nic names", conf->num_savednics);
+       for (i=0; i<conf->num_savednics; i++) {
+               struct saved_nic *s = &conf->saved_nics[i];
+               INFO("resetting nic %d to %s\n", s->ifindex, s->orig_name);
+               lxc_netdev_rename_by_index(s->ifindex, s->orig_name);
+               free(s->orig_name);
+       }
+       conf->num_savednics = 0;
+       free(conf->saved_nics);
+       /* clear the pointer so later cleanup cannot touch freed memory */
+       conf->saved_nics = NULL;
+}
+
 static int setup_private_host_hw_addr(char *veth1)
 {
        struct ifreq ifr;
        int err;
        int sockfd;
 
+       process_lock();
        sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+       process_unlock();
        if (sockfd < 0)
                return -errno;
 
        snprintf((char *)ifr.ifr_name, IFNAMSIZ, "%s", veth1);
        err = ioctl(sockfd, SIOCGIFHWADDR, &ifr);
        if (err < 0) {
+               process_lock();
                close(sockfd);
+               process_unlock();
                return -errno;
        }
 
        ifr.ifr_hwaddr.sa_data[0] = 0xfe;
        err = ioctl(sockfd, SIOCSIFHWADDR, &ifr);
+       process_lock();
        close(sockfd);
+       process_unlock();
        if (err < 0)
                return -errno;
 
@@ -1816,25 +2367,37 @@ struct lxc_conf *lxc_conf_init(void)
        }
        memset(new, 0, sizeof(*new));
 
+       new->loglevel = LXC_LOG_PRIORITY_NOTSET;
        new->personality = -1;
+       new->console.log_path = NULL;
+       new->console.log_fd = -1;
        new->console.path = NULL;
        new->console.peer = -1;
+       new->console.peerpty.busy = -1;
+       new->console.peerpty.master = -1;
+       new->console.peerpty.slave = -1;
        new->console.master = -1;
        new->console.slave = -1;
        new->console.name[0] = '\0';
-       new->rootfs.mount = default_rootfs_mount;
+       new->maincmd_fd = -1;
+       new->rootfs.mount = strdup(default_rootfs_mount);
+       if (!new->rootfs.mount) {
+               ERROR("lxc_conf_init : %m");
+               free(new);
+               return NULL;
+       }
+       new->kmsg = 1;
        lxc_list_init(&new->cgroup);
        lxc_list_init(&new->network);
        lxc_list_init(&new->mount_list);
        lxc_list_init(&new->caps);
+       lxc_list_init(&new->keepcaps);
+       lxc_list_init(&new->id_map);
        for (i=0; i<NUM_LXC_HOOKS; i++)
                lxc_list_init(&new->hooks[i]);
-#if HAVE_APPARMOR
-       new->aa_profile = NULL;
-#endif
-#if HAVE_APPARMOR /* || HAVE_SMACK || HAVE_SELINUX */
+       new->lsm_aa_profile = NULL;
+       new->lsm_se_context = NULL;
        new->lsm_umount_proc = 0;
-#endif
 
        return new;
 }
@@ -1853,13 +2416,13 @@ static int instanciate_veth(struct lxc_handler *handler, struct lxc_netdev *netd
                        ERROR("veth1 name too long");
                        return -1;
                }
-               veth1 = mktemp(veth1buf);
+               veth1 = mkifname(veth1buf);
                /* store away for deconf */
                memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
        }
 
        snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
-       veth2 = mktemp(veth2buf);
+       veth2 = mkifname(veth2buf);
 
        if (!strlen(veth1) || !strlen(veth2)) {
                ERROR("failed to allocate a temporary name");
@@ -1965,7 +2528,7 @@ static int instanciate_macvlan(struct lxc_handler *handler, struct lxc_netdev *n
        if (err >= sizeof(peerbuf))
                return -1;
 
-       peer = mktemp(peerbuf);
+       peer = mkifname(peerbuf);
        if (!strlen(peer)) {
                ERROR("failed to make a temporary name");
                return -1;
@@ -2201,6 +2764,77 @@ int lxc_assign_network(struct lxc_list *network, pid_t pid)
        return 0;
 }
 
+/*
+ * Write @buf_size bytes from @buf to /proc/<pid>/uid_map (ID_TYPE_UID) or
+ * /proc/<pid>/gid_map (any other @idtype).
+ *
+ * Returns 0 on success and a negative value on failure: -E2BIG if the
+ * path does not fit, -EINVAL if the file cannot be opened, -1 if the
+ * write fails, or the fclose() result if closing the stream fails.
+ */
+static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
+                           size_t buf_size)
+{
+       char path[PATH_MAX];
+       int ret, closeret;
+       FILE *f;
+
+       ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid, idtype == ID_TYPE_UID ? 'u' : 'g');
+       if (ret < 0 || ret >= PATH_MAX) {
+               fprintf(stderr, "%s: path name too long", __func__);
+               return -E2BIG;
+       }
+       process_lock();
+       f = fopen(path, "w");
+       process_unlock();
+       if (!f) {
+               perror("open");
+               return -EINVAL;
+       }
+       /* fwrite() returns the number of items written as a size_t, never a
+        * negative value, so compare it against the one item requested */
+       ret = 0;
+       if (buf_size && fwrite(buf, buf_size, 1, f) != 1) {
+               SYSERROR("writing id mapping");
+               ret = -1;
+       }
+       process_lock();
+       closeret = fclose(f);
+       process_unlock();
+       if (closeret)
+               SYSERROR("writing id mapping");
+       return ret < 0 ? ret : closeret;
+}
+
+/*
+ * For each id type (uid, then gid) format the matching entries of @idmap
+ * as "nsid hostid range" lines and hand them to write_id_mapping() for
+ * process @pid.  A type with no entries in @idmap is skipped.
+ *
+ * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, -E2BIG
+ * if the entries of one type do not fit in the 4096-byte buffer, or the
+ * non-zero result of write_id_mapping().
+ */
+int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
+{
+       struct lxc_list *iterator;
+       struct id_map *map;
+       int ret = 0;
+       enum idtype type;
+       char *buf = NULL, *pos;
+
+       for(type = ID_TYPE_UID; type <= ID_TYPE_GID; type++) {
+               int left, fill;
+
+               pos = buf;
+               lxc_list_for_each(iterator, idmap) {
+                       /* The kernel only takes <= 4k for writes to /proc/<nr>/[ug]id_map */
+                       if (!buf)
+                               buf = pos = malloc(4096);
+                       if (!buf)
+                               return -ENOMEM;
+
+                       map = iterator->elem;
+                       if (map->idtype != type)
+                               continue;
+
+                       left = 4096 - (pos - buf);
+                       fill = snprintf(pos, left, "%lu %lu %lu\n",
+                               map->nsid, map->hostid, map->range);
+                       if (fill <= 0 || fill >= left) {
+                               /* stop here: advancing pos by a truncated
+                                * length would move it past the buffer and
+                                * let the next entry write out of bounds */
+                               ERROR("snprintf failed, too many mappings");
+                               ret = -E2BIG;
+                               goto out;
+                       }
+                       pos += fill;
+               }
+               if (pos == buf) // no mappings were found
+                       continue;
+               ret = write_id_mapping(type, pid, buf, pos-buf);
+               if (ret)
+                       break;
+       }
+
+out:
+       free(buf);
+       return ret;
+}
+
 int lxc_find_gateway_addresses(struct lxc_handler *handler)
 {
        struct lxc_list *network = &handler->conf->network;
@@ -2252,7 +2886,7 @@ int lxc_find_gateway_addresses(struct lxc_handler *handler)
 int lxc_create_tty(const char *name, struct lxc_conf *conf)
 {
        struct lxc_tty_info *tty_info = &conf->tty_info;
-       int i;
+       int i, ret;
 
        /* no tty in the configuration */
        if (!conf->tty)
@@ -2269,8 +2903,11 @@ int lxc_create_tty(const char *name, struct lxc_conf *conf)
 
                struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
 
-               if (openpty(&pty_info->master, &pty_info->slave,
-                           pty_info->name, NULL, NULL)) {
+               process_lock();
+               ret = openpty(&pty_info->master, &pty_info->slave,
+                           pty_info->name, NULL, NULL);
+               process_unlock();
+               if (ret) {
                        SYSERROR("failed to create pty #%d", i);
                        tty_info->nbtty = i;
                        lxc_delete_tty(tty_info);
@@ -2301,20 +2938,105 @@ void lxc_delete_tty(struct lxc_tty_info *tty_info)
        for (i = 0; i < tty_info->nbtty; i++) {
                struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
 
+               process_lock();
                close(pty_info->master);
                close(pty_info->slave);
+               process_unlock();
        }
 
        free(tty_info->pty_info);
        tty_info->nbtty = 0;
 }
 
-int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
+/*
+ * given an id inside a mapping's [nsid, nsid+range) window, return the
+ * id at the same offset from that mapping's hostid; otherwise return it as is.
+ */
+static int shiftid(struct lxc_conf *c, int uid, enum idtype w)
 {
-#if HAVE_APPARMOR /* || HAVE_SMACK || HAVE_SELINUX */
-       int mounted;
-#endif
+       struct lxc_list *iterator;
+       struct id_map *map;
+       int low, high;
+
+       lxc_list_for_each(iterator, &c->id_map) {
+               map = iterator->elem;
+               if (map->idtype != w)   /* only consider mappings of the requested kind */
+                       continue;
 
+               low = map->nsid;
+               high = map->nsid + map->range;  /* exclusive upper bound */
+               if (uid < low || uid >= high)
+                       continue;
+
+               return uid - low + map->hostid;
+       }
+
+       return uid;
+}
+
+/*
+ * stat() @path and run its owner and group through shiftid(); when either
+ * result differs from the current value, chown() the path to the shifted
+ * ids.  (Used for ttys)
+ * Returns 0 on success, -1 if stat() or chown() fails.
+ */
+static int uid_shift_file(char *path, struct lxc_conf *c)
+{
+       struct stat st;
+       int shifted_uid, shifted_gid;
+
+       if (stat(path, &st) != 0) {
+               SYSERROR("stat(%s)", path);
+               return -1;
+       }
+
+       shifted_uid = shiftid(c, st.st_uid, ID_TYPE_UID);
+       shifted_gid = shiftid(c, st.st_gid, ID_TYPE_GID);
+
+       /* nothing to do when both ids come back unchanged */
+       if (shifted_uid == st.st_uid && shifted_gid == st.st_gid)
+               return 0;
+
+       DEBUG("chowning %s from %d:%d to %d:%d\n", path, (int)st.st_uid, (int)st.st_gid, shifted_uid, shifted_gid);
+       if (chown(path, shifted_uid, shifted_gid) != 0) {
+               SYSERROR("chown(%s)", path);
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Run uid_shift_file() on the console and on tty1..tty<nbtty> as seen
+ * through /proc/<pid>/root/dev/<ttydir>/ (an unset ttydir is treated as "").
+ * Does nothing when conf->rootfs.path is not set.
+ * Returns 0 on success, -1 if a path is too long or a shift fails.
+ */
+int uid_shift_ttys(int pid, struct lxc_conf *conf)
+{
+       struct lxc_tty_info *ttys = &conf->tty_info;
+       const char *subdir = conf->ttydir ? conf->ttydir : "";
+       char devpath[MAXPATHLEN];
+       int idx, len;
+
+       if (!conf->rootfs.path)
+               return 0;
+
+       /* first the console */
+       len = snprintf(devpath, sizeof(devpath), "/proc/%d/root/dev/%s/console", pid, subdir);
+       if (len < 0 || len >= sizeof(devpath)) {
+               ERROR("console path too long\n");
+               return -1;
+       }
+       if (uid_shift_file(devpath, conf)) {
+               DEBUG("Failed to chown the console %s.\n", devpath);
+               return -1;
+       }
+
+       /* then each tty; device names are numbered from 1 */
+       for (idx = 1; idx <= ttys->nbtty; idx++) {
+               len = snprintf(devpath, sizeof(devpath), "/proc/%d/root/dev/%s/tty%d",
+                       pid, subdir, idx);
+               if (len < 0 || len >= sizeof(devpath)) {
+                       ERROR("pathname too long for ttys");
+                       return -1;
+               }
+               if (uid_shift_file(devpath, conf)) {
+                       DEBUG("Failed to chown pty %s.\n", devpath);
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+int lxc_setup(const char *name, struct lxc_conf *lxc_conf, const char *lxcpath, struct cgroup_process_info *cgroup_info)
+{
        if (setup_utsname(lxc_conf->utsname)) {
                ERROR("failed to setup the utsname for '%s'", name);
                return -1;
@@ -2325,70 +3047,94 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
                return -1;
        }
 
-       if (run_lxc_hooks(name, "pre-mount", lxc_conf)) {
+       if (run_lxc_hooks(name, "pre-mount", lxc_conf, lxcpath, NULL)) {
                ERROR("failed to run pre-mount hooks for container '%s'.", name);
                return -1;
        }
 
-       if (setup_rootfs(&lxc_conf->rootfs)) {
+       if (setup_rootfs(lxc_conf)) {
                ERROR("failed to setup rootfs for '%s'", name);
                return -1;
        }
 
        if (lxc_conf->autodev) {
-               if (setup_autodev(lxc_conf->rootfs.mount)) {
-                       ERROR("failed to set up /dev in the container");
+               if (mount_autodev(lxc_conf->rootfs.mount)) {
+                       ERROR("failed to mount /dev in the container");
                        return -1;
                }
        }
 
+       /* do automatic mounts (mainly /proc and /sys), but exclude
+        * those that need to wait until other stuff has finished
+        */
+       if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP & ~LXC_AUTO_PROC_SYSRQ, cgroup_info) < 0) {
+               ERROR("failed to setup the automatic mounts for '%s'", name);
+               return -1;
+       }
+
        if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name)) {
                ERROR("failed to setup the mounts for '%s'", name);
                return -1;
        }
 
-       if (setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name)) {
+       if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name)) {
                ERROR("failed to setup the mount entries for '%s'", name);
                return -1;
        }
 
-       if (run_lxc_hooks(name, "mount", lxc_conf)) {
+       /* now mount only cgroup, if wanted;
+        * before, /sys could not have been mounted
+        * (is either mounted automatically or via fstab entries)
+        */
+       if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP, cgroup_info) < 0) {
+               ERROR("failed to setup the automatic mounts for '%s'", name);
+               return -1;
+       }
+
+       if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
                ERROR("failed to run mount hooks for container '%s'.", name);
                return -1;
        }
 
-       if (setup_cgroup(name, &lxc_conf->cgroup)) {
-               ERROR("failed to setup the cgroups for '%s'", name);
+       if (lxc_conf->autodev) {
+               if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
+                       ERROR("failed to run autodev hooks for container '%s'.", name);
+                       return -1;
+               }
+               if (setup_autodev(lxc_conf->rootfs.mount)) {
+                       ERROR("failed to populate /dev in the container");
+                       return -1;
+               }
+       }
+
+       /* over-mount /proc/sysrq-trigger with /dev/null now, if wanted;
+        * before /dev/null did not necessarily exist
+        */
+       if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_PROC_SYSRQ, cgroup_info) < 0) {
+               ERROR("failed to setup the automatic mounts for '%s'", name);
                return -1;
        }
 
-       if (setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
+       if (!lxc_conf->is_execute && setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
                ERROR("failed to setup the console for '%s'", name);
                return -1;
        }
 
-       if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console))  // don't fail
-               ERROR("failed to setup kmsg for '%s'", name);
+       if (lxc_conf->kmsg) {
+               if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console))  // don't fail
+                       ERROR("failed to setup kmsg for '%s'", name);
+       }
 
-       if (setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)) {
+       if (!lxc_conf->is_execute && setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)) {
                ERROR("failed to setup the ttys for '%s'", name);
                return -1;
        }
 
-#if HAVE_APPARMOR /* || HAVE_SMACK || HAVE_SELINUX */
-       INFO("rootfs path is .%s., mount is .%s.", lxc_conf->rootfs.path,
-               lxc_conf->rootfs.mount);
-       if (lxc_conf->rootfs.path == NULL || strlen(lxc_conf->rootfs.path) == 0)
-               mounted = 0;
-       else
-               mounted = lsm_mount_proc_if_needed(lxc_conf->rootfs.path, lxc_conf->rootfs.mount);
-       if (mounted == -1) {
-               SYSERROR("failed to mount /proc in the container.");
+       /* mount /proc if needed for LSM transition */
+       if (lsm_proc_mount(lxc_conf) < 0) {
+               ERROR("failed to LSM mount proc for '%s'", name);
                return -1;
-       } else if (mounted == 1) {
-               lxc_conf->lsm_umount_proc = 1;
        }
-#endif
 
        if (setup_pivot_root(&lxc_conf->rootfs)) {
                ERROR("failed to set rootfs for '%s'", name);
@@ -2405,9 +3151,20 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
                return -1;
        }
 
-       if (setup_caps(&lxc_conf->caps)) {
-               ERROR("failed to drop capabilities");
-               return -1;
+       if (lxc_list_empty(&lxc_conf->id_map)) {
+               if (!lxc_list_empty(&lxc_conf->keepcaps)) {
+                       if (!lxc_list_empty(&lxc_conf->caps)) {
+                               ERROR("Simultaneously requested dropping and keeping caps");
+                               return -1;
+                       }
+                       if (dropcaps_except(&lxc_conf->keepcaps)) {
+                               ERROR("failed to keep requested caps\n");
+                               return -1;
+                       }
+               } else if (setup_caps(&lxc_conf->caps)) {
+                       ERROR("failed to drop capabilities");
+                       return -1;
+               }
        }
 
        NOTICE("'%s' is setup.", name);
@@ -2415,7 +3172,8 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
        return 0;
 }
 
-int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf)
+int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
+                 const char *lxcpath, char *argv[])
 {
        int which = -1;
        struct lxc_list *it;
@@ -2426,16 +3184,20 @@ int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf)
                which = LXCHOOK_PREMOUNT;
        else if (strcmp(hook, "mount") == 0)
                which = LXCHOOK_MOUNT;
+       else if (strcmp(hook, "autodev") == 0)
+               which = LXCHOOK_AUTODEV;
        else if (strcmp(hook, "start") == 0)
                which = LXCHOOK_START;
        else if (strcmp(hook, "post-stop") == 0)
                which = LXCHOOK_POSTSTOP;
+       else if (strcmp(hook, "clone") == 0)
+               which = LXCHOOK_CLONE;
        else
                return -1;
        lxc_list_for_each(it, &conf->hooks[which]) {
                int ret;
                char *hookname = it->elem;
-               ret = run_script(name, "lxc", hookname, hook, NULL);
+               ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
                if (ret)
                        return ret;
        }
@@ -2473,11 +3235,12 @@ static void lxc_remove_nic(struct lxc_list *it)
                free(it2->elem);
                free(it2);
        }
+       free(netdev);
        free(it);
 }
 
 /* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
-int lxc_clear_nic(struct lxc_conf *c, char *key)
+int lxc_clear_nic(struct lxc_conf *c, const char *key)
 {
        char *p1;
        int ret, idx, i;
@@ -2585,11 +3348,35 @@ int lxc_clear_config_caps(struct lxc_conf *c)
        return 0;
 }
 
-int lxc_clear_cgroups(struct lxc_conf *c, char *key)
+/*
+ * Empty c->id_map: unlink each node, then free its element and the node
+ * itself.  Always returns 0.
+ */
+int lxc_clear_idmaps(struct lxc_conf *c)
+{
+       struct lxc_list *head = &c->id_map;
+       struct lxc_list *cur, *following;
+
+       /* the _safe iterator remembers the next node, so cur may be freed */
+       lxc_list_for_each_safe(cur, head, following) {
+               lxc_list_del(cur);
+               free(cur->elem);
+               free(cur);
+       }
+
+       return 0;
+}
+
+/*
+ * Empty c->keepcaps: unlink each node, then free its element and the node
+ * itself.  Always returns 0.
+ */
+int lxc_clear_config_keepcaps(struct lxc_conf *c)
+{
+       struct lxc_list *head = &c->keepcaps;
+       struct lxc_list *cur, *following;
+
+       /* the _safe iterator remembers the next node, so cur may be freed */
+       lxc_list_for_each_safe(cur, head, following) {
+               lxc_list_del(cur);
+               free(cur->elem);
+               free(cur);
+       }
+
+       return 0;
+}
+
+int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
 {
        struct lxc_list *it,*next;
        bool all = false;
-       char *k = key + 11;
+       const char *k = key + 11;
 
        if (strcmp(key, "lxc.cgroup") == 0)
                all = true;
@@ -2619,11 +3406,11 @@ int lxc_clear_mount_entries(struct lxc_conf *c)
        return 0;
 }
 
-int lxc_clear_hooks(struct lxc_conf *c, char *key)
+int lxc_clear_hooks(struct lxc_conf *c, const char *key)
 {
        struct lxc_list *it,*next;
        bool all = false, done = false;
-       char *k = key + 9;
+       const char *k = key + 9;
        int i;
 
        if (strcmp(key, "lxc.hook") == 0)
@@ -2647,22 +3434,48 @@ int lxc_clear_hooks(struct lxc_conf *c, char *key)
        return 0;
 }
 
+/*
+ * Free the saved NIC records held in @conf: every orig_name string and
+ * the saved_nics array itself, then reset the pointer and the count.
+ * Does nothing when no NICs are recorded.
+ */
+void lxc_clear_saved_nics(struct lxc_conf *conf)
+{
+       int i;
+
+       if (!conf->num_savednics)
+               return;
+       for (i=0; i < conf->num_savednics; i++)
+               free(conf->saved_nics[i].orig_name);
+       /* release the array before clearing the pointer: clearing it first
+        * would hand NULL to free() and leak the array */
+       free(conf->saved_nics);
+       conf->saved_nics = NULL;
+       conf->num_savednics = 0;
+}
+
 void lxc_conf_free(struct lxc_conf *conf) /* free conf and the members released below; a NULL conf is ignored */
 {
        if (!conf)
                return;
        if (conf->console.path)
                free(conf->console.path);
-       if (conf->rootfs.mount != default_rootfs_mount)
+       if (conf->rootfs.mount) /* no default_rootfs_mount check: lxc_conf_init() now strdup()s the default */
                free(conf->rootfs.mount);
+       if (conf->rootfs.path)
+               free(conf->rootfs.path);
+       if (conf->utsname)
+               free(conf->utsname);
+       if (conf->ttydir)
+               free(conf->ttydir);
+       if (conf->fstab)
+               free(conf->fstab);
+       if (conf->rcfile)
+               free(conf->rcfile);
        lxc_clear_config_network(conf);
-#if HAVE_APPARMOR
-       if (conf->aa_profile)
-               free(conf->aa_profile);
-#endif
+       if (conf->lsm_aa_profile)
+               free(conf->lsm_aa_profile);
+       if (conf->lsm_se_context)
+               free(conf->lsm_se_context);
+       lxc_seccomp_free(conf);
        lxc_clear_config_caps(conf);
+       lxc_clear_config_keepcaps(conf); /* unlinks and frees every keepcaps entry */
        lxc_clear_cgroups(conf, "lxc.cgroup");
        lxc_clear_hooks(conf, "lxc.hook");
        lxc_clear_mount_entries(conf);
+       lxc_clear_saved_nics(conf);
+       lxc_clear_idmaps(conf); /* unlinks and frees every id_map entry */
        free(conf); /* the lists above are embedded in conf, so it goes last */
 }