git.proxmox.com Git - mirror_lxc.git/blobdiff - src/lxc/conf.c
refactor AppArmor into LSM backend, add SELinux support
[mirror_lxc.git] / src / lxc / conf.c
index bb93189c29cbaacb87a07ef846f6081dbf4173a3..18a92c9d2e2f8845416cf066aba6e38615861da1 100644 (file)
@@ -4,7 +4,7 @@
  * (C) Copyright IBM Corp. 2007, 2008
  *
  * Authors:
- * Daniel Lezcano <dlezcano at fr.ibm.com>
+ * Daniel Lezcano <daniel.lezcano at free.fr>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -18,7 +18,7 @@
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #define _GNU_SOURCE
 #include <stdio.h>
 #include <unistd.h>
 #include <sys/wait.h>
 #include <sys/syscall.h>
+#include <time.h>
+
+#if HAVE_IFADDRS_H
+#include <ifaddrs.h>
+#else
+#include <../include/ifaddrs.h>
+#endif
 
 #if HAVE_PTY_H
 #include <pty.h>
 #include "log.h"
 #include "lxc.h"       /* for lxc_cgroup_set() */
 #include "caps.h"       /* for lxc_caps_last_cap() */
-
-#if HAVE_APPARMOR
-#include <apparmor.h>
-#endif
+#include "bdev.h"
+#include "cgroup.h"
+#include "lxclock.h"
+#include "lsm/lsm.h"
 
 #if HAVE_SYS_CAPABILITY_H
 #include <sys/capability.h>
@@ -92,30 +99,6 @@ lxc_log_define(lxc_conf, lxc);
 #define MAXMTULEN   16
 #define MAXLINELEN  128
 
-#ifndef MS_DIRSYNC
-#define MS_DIRSYNC  128
-#endif
-
-#ifndef MS_REC
-#define MS_REC 16384
-#endif
-
-#ifndef MNT_DETACH
-#define MNT_DETACH 2
-#endif
-
-#ifndef MS_SLAVE
-#define MS_SLAVE (1<<19)
-#endif
-
-#ifndef MS_RELATIME
-#define MS_RELATIME (1 << 21)
-#endif
-
-#ifndef MS_STRICTATIME
-#define MS_STRICTATIME (1 << 24)
-#endif
-
 #if HAVE_SYS_CAPABILITY_H
 #ifndef CAP_SETFCAP
 #define CAP_SETFCAP 31
@@ -172,7 +155,7 @@ return -1;
 #endif
 
 char *lxchook_names[NUM_LXC_HOOKS] = {
-       "pre-start", "pre-mount", "mount", "autodev", "start", "post-stop" };
+       "pre-start", "pre-mount", "mount", "autodev", "start", "post-stop", "clone" };
 
 typedef int (*instanciate_cb)(struct lxc_handler *, struct lxc_netdev *);
 
@@ -295,12 +278,83 @@ static struct caps_opt caps_opt[] = {
 static struct caps_opt caps_opt[] = {};
 #endif
 
+static char padchar[] =
+"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+static char *mkifname(char *template)
+{
+       char *name = NULL;
+       int i = 0;
+       FILE *urandom;
+       unsigned int seed;
+       struct ifaddrs *ifaddr, *ifa;
+       int ifexists = 0;
+
+       /* Get all the network interfaces */
+       getifaddrs(&ifaddr);
+
+       /* Initialize the random number generator */
+       process_lock();
+       urandom = fopen ("/dev/urandom", "r");
+       process_unlock();
+       if (urandom != NULL) {
+               if (fread (&seed, sizeof(seed), 1, urandom) <= 0)
+                       seed = time(0);
+               process_lock();
+               fclose(urandom);
+               process_unlock();
+       }
+       else
+               seed = time(0);
+
+#ifndef HAVE_RAND_R
+       srand(seed);
+#endif
+
+       /* Generate random names until we find one that doesn't exist */
+       while(1) {
+               ifexists = 0;
+               name = strdup(template);
+
+               if (name == NULL)
+                       return NULL;
+
+               for (i = 0; i < strlen(name); i++) {
+                       if (name[i] == 'X') {
+#ifdef HAVE_RAND_R
+                               name[i] = padchar[rand_r(&seed) % (strlen(padchar) - 1)];
+#else
+                               name[i] = padchar[rand() % (strlen(padchar) - 1)];
+#endif
+                       }
+               }
+
+               for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) {
+                       if (strcmp(ifa->ifa_name, name) == 0) {
+                               ifexists = 1;
+                               break;
+                       }
+               }
+
+               if (ifexists == 0)
+                       break;
+
+               free(name);
+       }
+
+       freeifaddrs(ifaddr);
+       return name;
+}
+
 static int run_buffer(char *buffer)
 {
        FILE *f;
        char *output;
+       int ret;
 
+       process_lock();
        f = popen(buffer, "r");
+       process_unlock();
        if (!f) {
                SYSERROR("popen failed");
                return -1;
@@ -309,6 +363,9 @@ static int run_buffer(char *buffer)
        output = malloc(LXC_LOG_BUFFER_SIZE);
        if (!output) {
                ERROR("failed to allocate memory for script output");
+               process_lock();
+               pclose(f);
+               process_unlock();
                return -1;
        }
 
@@ -317,14 +374,74 @@ static int run_buffer(char *buffer)
 
        free(output);
 
-       if (pclose(f) == -1) {
+       process_lock();
+       ret = pclose(f);
+       process_unlock();
+       if (ret == -1) {
                SYSERROR("Script exited on error");
                return -1;
+       } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
+               ERROR("Script exited with status %d", WEXITSTATUS(ret));
+               return -1;
+       } else if (WIFSIGNALED(ret)) {
+               ERROR("Script terminated by signal %d (%s)", WTERMSIG(ret),
+                     strsignal(WTERMSIG(ret)));
+               return -1;
        }
 
        return 0;
 }
 
+static int run_script_argv(const char *name, const char *section,
+                     const char *script, const char *hook, const char *lxcpath,
+                     char **argsin)
+{
+       int ret, i;
+       char *buffer;
+       size_t size = 0;
+
+       INFO("Executing script '%s' for container '%s', config section '%s'",
+            script, name, section);
+
+       for (i=0; argsin && argsin[i]; i++)
+               size += strlen(argsin[i]) + 1;
+
+       size += strlen(hook) + 1;
+
+       size += strlen(script);
+       size += strlen(name);
+       size += strlen(section);
+       size += 3;
+
+       if (size > INT_MAX)
+               return -1;
+
+       buffer = alloca(size);
+       if (!buffer) {
+               ERROR("failed to allocate memory");
+               return -1;
+       }
+
+       ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
+       if (ret < 0 || ret >= size) {
+               ERROR("Script name too long");
+               return -1;
+       }
+
+       for (i=0; argsin && argsin[i]; i++) {
+               int len = size-ret;
+               int rc;
+               rc = snprintf(buffer + ret, len, " %s", argsin[i]);
+               if (rc < 0 || rc >= len) {
+                       ERROR("Script args too long");
+                       return -1;
+               }
+               ret += rc;
+       }
+
+       return run_buffer(buffer);
+}
+
 static int run_script(const char *name, const char *section,
                      const char *script, ...)
 {
@@ -358,7 +475,6 @@ static int run_script(const char *name, const char *section,
        ret = snprintf(buffer, size, "%s %s %s", script, name, section);
        if (ret < 0 || ret >= size) {
                ERROR("Script name too long");
-               free(buffer);
                return -1;
        }
 
@@ -368,7 +484,6 @@ static int run_script(const char *name, const char *section,
                int rc;
                rc = snprintf(buffer + ret, len, " %s", p);
                if (rc < 0 || rc >= len) {
-                       free(buffer);
                        ERROR("Script args too long");
                        return -1;
                }
@@ -466,7 +581,9 @@ static int setup_lodev(const char *rootfs, int fd, struct loop_info64 *loinfo)
        int rfd;
        int ret = -1;
 
+       process_lock();
        rfd = open(rootfs, O_RDWR);
+       process_unlock();
        if (rfd < 0) {
                SYSERROR("failed to open '%s'", rootfs);
                return -1;
@@ -488,7 +605,9 @@ static int setup_lodev(const char *rootfs, int fd, struct loop_info64 *loinfo)
 
        ret = 0;
 out:
+       process_lock();
        close(rfd);
+       process_unlock();
 
        return ret;
 }
@@ -501,7 +620,9 @@ static int mount_rootfs_file(const char *rootfs, const char *target)
        DIR *dir;
        char path[MAXPATHLEN];
 
+       process_lock();
        dir = opendir("/dev");
+       process_unlock();
        if (!dir) {
                SYSERROR("failed to open '/dev'");
                return -1;
@@ -525,18 +646,25 @@ static int mount_rootfs_file(const char *rootfs, const char *target)
                if (rc < 0 || rc >= MAXPATHLEN)
                        continue;
 
+               process_lock();
                fd = open(path, O_RDWR);
+               process_unlock();
                if (fd < 0)
                        continue;
 
                if (ioctl(fd, LOOP_GET_STATUS64, &loinfo) == 0) {
+                       process_lock();
                        close(fd);
+                       process_unlock();
                        continue;
                }
 
                if (errno != ENXIO) {
                        WARN("unexpected error for ioctl on '%s': %m",
                             direntp->d_name);
+                       process_lock();
+                       close(fd);
+                       process_unlock();
                        continue;
                }
 
@@ -545,13 +673,17 @@ static int mount_rootfs_file(const char *rootfs, const char *target)
                ret = setup_lodev(rootfs, fd, &loinfo);
                if (!ret)
                        ret = mount_unknow_fs(path, target, 0);
+               process_lock();
                close(fd);
+               process_unlock();
 
                break;
        }
 
+       process_lock();
        if (closedir(dir))
                WARN("failed to close directory");
+       process_unlock();
 
        return ret;
 }
@@ -563,9 +695,10 @@ static int mount_rootfs_block(const char *rootfs, const char *target)
 
 /*
  * pin_rootfs
- * if rootfs is a directory, then open ${rootfs}.hold for writing for the
- * duration of the container run, to prevent the container from marking the
- * underlying fs readonly on shutdown.
+ * if rootfs is a directory, then open ${rootfs}/lxc.hold for writing for
+ * the duration of the container run, to prevent the container from marking
+ * the underlying fs readonly on shutdown. unlink the file immediately so
+ * no name pollution happens
  * return -1 on error.
  * return -2 if nothing needed to be pinned.
  * return an open fd (>=0) if we pinned it.
@@ -578,37 +711,116 @@ int pin_rootfs(const char *rootfs)
        int ret, fd;
 
        if (rootfs == NULL || strlen(rootfs) == 0)
-               return 0;
+               return -2;
 
-       if (!realpath(rootfs, absrootfs)) {
-               SYSERROR("failed to get real path for '%s'", rootfs);
-               return -1;
-       }
+       if (!realpath(rootfs, absrootfs))
+               return -2;
 
-       if (access(absrootfs, F_OK)) {
-               SYSERROR("'%s' is not accessible", absrootfs);
+       if (access(absrootfs, F_OK))
                return -1;
-       }
 
-       if (stat(absrootfs, &s)) {
-               SYSERROR("failed to stat '%s'", absrootfs);
+       if (stat(absrootfs, &s))
                return -1;
-       }
 
        if (!S_ISDIR(s.st_mode))
                return -2;
 
-       ret = snprintf(absrootfspin, MAXPATHLEN, "%s%s", absrootfs, ".hold");
-       if (ret >= MAXPATHLEN) {
-               SYSERROR("pathname too long for rootfs hold file");
+       ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
+       if (ret >= MAXPATHLEN)
                return -1;
-       }
 
+       process_lock();
        fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
-       INFO("opened %s as fd %d\n", absrootfspin, fd);
+       process_unlock();
+       if (fd < 0)
+               return fd;
+       (void)unlink(absrootfspin);
        return fd;
 }
 
+static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct cgroup_process_info *cgroup_info)
+{
+       char *path = NULL;
+       char *dev_null = NULL;
+       int r;
+
+       dev_null = lxc_append_paths(conf->rootfs.mount, "/dev/null");
+       if (!dev_null) {
+               SYSERROR("memory allocation error");
+               goto cleanup;
+       }
+
+       if (flags & LXC_AUTO_PROC) {
+               path = lxc_append_paths(conf->rootfs.mount, "/proc");
+               if (!path) {
+                       SYSERROR("memory allocation error trying to automatically mount /proc");
+                       goto cleanup;
+               }
+
+               r = mount("proc", path, "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
+               if (r < 0) {
+                       SYSERROR("error mounting /proc");
+                       goto cleanup;
+               }
+
+               free(path);
+               path = NULL;
+       }
+
+       if (flags & LXC_AUTO_PROC_SYSRQ) {
+               path = lxc_append_paths(conf->rootfs.mount, "/proc/sysrq-trigger");
+               if (!path) {
+                       SYSERROR("memory allocation error trying to automatically mount /proc");
+                       goto cleanup;
+               }
+
+               /* safety measure, mount /dev/null over /proc/sysrq-trigger,
+                * otherwise, a container may trigger a host reboot or such
+                */
+               r = mount(dev_null, path, NULL, MS_BIND, NULL);
+               if (r < 0)
+                       WARN("error mounting /dev/null over /proc/sysrq-trigger: %s", strerror(errno));
+
+               free(path);
+               path = NULL;
+       }
+
+       if (flags & LXC_AUTO_SYS) {
+               path = lxc_append_paths(conf->rootfs.mount, "/sys");
+               if (!path) {
+                       SYSERROR("memory allocation error trying to automatically mount /sys");
+                       goto cleanup;
+               }
+
+               r = mount("sysfs", path, "sysfs", MS_RDONLY, NULL);
+               if (r < 0) {
+                       SYSERROR("error mounting /sys");
+                       goto cleanup;
+               }
+
+               free(path);
+               path = NULL;
+       }
+
+       if (flags & LXC_AUTO_CGROUP) {
+               r = lxc_setup_mount_cgroup(conf->rootfs.mount, cgroup_info);
+               if (r < 0) {
+                       SYSERROR("error mounting /sys/fs/cgroup");
+                       goto cleanup;
+               }
+       }
+
+       free(dev_null);
+       free(path);
+
+       return 0;
+
+cleanup:
+       free(dev_null);
+       free(path);
+       return -1;
+}
+
 static int mount_rootfs(const char *rootfs, const char *target)
 {
        char absrootfs[MAXPATHLEN];
@@ -695,12 +907,17 @@ static int setup_tty(const struct lxc_rootfs *rootfs,
                                ERROR("pathname too long for ttys");
                                return -1;
                        }
+                       process_lock();
                        ret = creat(lxcpath, 0660);
+                       process_unlock();
                        if (ret==-1 && errno != EEXIST) {
                                SYSERROR("error creating %s\n", lxcpath);
                                return -1;
                        }
-                       close(ret);
+                       process_lock();
+                       if (ret >= 0)
+                               close(ret);
+                       process_unlock();
                        ret = unlink(path);
                        if (ret && errno != ENOENT) {
                                SYSERROR("error unlinking %s\n", path);
@@ -726,12 +943,17 @@ static int setup_tty(const struct lxc_rootfs *rootfs,
                } else {
                        /* If we populated /dev, then we need to create /dev/ttyN */
                        if (access(path, F_OK)) {
+                               process_lock();
                                ret = creat(path, 0660);
+                               process_unlock();
                                if (ret==-1) {
                                        SYSERROR("error creating %s\n", path);
                                        /* this isn't fatal, continue */
-                               } else
+                               } else {
+                                       process_lock();
                                        close(ret);
+                                       process_unlock();
+                               }
                        }
                        if (mount(pty_info->name, path, "none", MS_BIND, 0)) {
                                WARN("failed to mount '%s'->'%s'",
@@ -749,7 +971,7 @@ static int setup_tty(const struct lxc_rootfs *rootfs,
 static int setup_rootfs_pivot_root_cb(char *buffer, void *data)
 {
        struct lxc_list *mountlist, *listentry, *iterator;
-       char *pivotdir, *mountpoint, *mountentry;
+       char *pivotdir, *mountpoint, *mountentry, *saveptr = NULL;
        int found;
        void **cbparm;
 
@@ -760,12 +982,12 @@ static int setup_rootfs_pivot_root_cb(char *buffer, void *data)
        pivotdir  = cbparm[1];
 
        /* parse entry, first field is mountname, ignore */
-       mountpoint = strtok(mountentry, " ");
+       mountpoint = strtok_r(mountentry, " ", &saveptr);
        if (!mountpoint)
                return -1;
 
        /* second field is mountpoint */
-       mountpoint = strtok(NULL, " ");
+       mountpoint = strtok_r(NULL, " ", &saveptr);
        if (!mountpoint)
                return -1;
 
@@ -794,6 +1016,7 @@ static int setup_rootfs_pivot_root_cb(char *buffer, void *data)
        listentry->elem = strdup(mountpoint);
        if (!listentry->elem) {
                SYSERROR("strdup failed");
+               free(listentry);
                return -1;
        }
        lxc_list_add_tail(mountlist, listentry);
@@ -1038,7 +1261,9 @@ int detect_shared_rootfs(void)
        int i;
        char *p2;
 
+       process_lock();
        f = fopen("/proc/self/mountinfo", "r");
+       process_unlock();
        if (!f)
                return 0;
        while ((p = fgets(buf, LINELEN, f))) {
@@ -1055,11 +1280,17 @@ int detect_shared_rootfs(void)
                if (strcmp(p+1, "/") == 0) {
                        // this is '/'.  is it shared?
                        p = index(p2+1, ' ');
-                       if (strstr(p, "shared:"))
+                       if (p && strstr(p, "shared:")) {
+                               process_lock();
+                               fclose(f);
+                               process_unlock();
                                return 1;
+                       }
                }
        }
+       process_lock();
        fclose(f);
+       process_unlock();
        return 0;
 }
 
@@ -1129,8 +1360,13 @@ static int setup_rootfs(struct lxc_conf *conf)
 {
        const struct lxc_rootfs *rootfs = &conf->rootfs;
 
-       if (!rootfs->path)
+       if (!rootfs->path) {
+               if (mount("", "/", NULL, MS_SLAVE|MS_REC, 0)) {
+                       SYSERROR("Failed to make / rslave");
+                       return -1;
+               }
                return 0;
+       }
 
        if (access(rootfs->mount, F_OK)) {
                SYSERROR("failed to access to '%s', check it is present",
@@ -1145,6 +1381,15 @@ static int setup_rootfs(struct lxc_conf *conf)
                }
        }
 
+       // First try mounting rootfs using a bdev
+       struct bdev *bdev = bdev_init(rootfs->path, rootfs->mount, NULL);
+       if (bdev && bdev->ops->mount(bdev) == 0) {
+               bdev_put(bdev);
+               DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount);
+               return 0;
+       }
+       if (bdev)
+               bdev_put(bdev);
        if (mount_rootfs(rootfs->path, rootfs->mount)) {
                ERROR("failed to mount rootfs");
                return -1;
@@ -1243,8 +1488,8 @@ static int setup_dev_console(const struct lxc_rootfs *rootfs,
                return 0;
        }
 
-       if (console->peer == -1) {
-               INFO("no console output required");
+       if (console->master < 0) {
+               INFO("no console");
                return 0;
        }
 
@@ -1301,15 +1546,20 @@ static int setup_ttydir_console(const struct lxc_rootfs *rootfs,
                return -1;
        }
 
+       process_lock();
        ret = creat(lxcpath, 0660);
+       process_unlock();
        if (ret==-1 && errno != EEXIST) {
                SYSERROR("error %d creating %s\n", errno, lxcpath);
                return -1;
        }
-       close(ret);
+       process_lock();
+       if (ret >= 0)
+               close(ret);
+       process_unlock();
 
-       if (console->peer == -1) {
-               INFO("no console output required");
+       if (console->master < 0) {
+               INFO("no console");
                return 0;
        }
 
@@ -1375,31 +1625,6 @@ static int setup_kmsg(const struct lxc_rootfs *rootfs,
        return 0;
 }
 
-int setup_cgroup(const char *name, struct lxc_list *cgroups)
-{
-       struct lxc_list *iterator;
-       struct lxc_cgroup *cg;
-       int ret = -1;
-
-       if (lxc_list_empty(cgroups))
-               return 0;
-
-       lxc_list_for_each(iterator, cgroups) {
-
-               cg = iterator->elem;
-
-               if (lxc_cgroup_set(name, cg->subsystem, cg->value))
-                       goto out;
-
-               DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
-       }
-
-       ret = 0;
-       INFO("cgroup has been setup");
-out:
-       return ret;
-}
-
 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
 {
        struct mount_opt *mo;
@@ -1519,7 +1744,7 @@ static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
        unsigned long mntflags;
        char *mntdata;
        int r, ret = 0, offset;
-       char *lxcpath;
+       const char *lxcpath;
 
        if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
                ERROR("failed to parse mount option '%s'", mntent->mnt_opts);
@@ -1535,7 +1760,6 @@ static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
        /* if rootfs->path is a blockdev path, allow container fstab to
         * use $lxcpath/CN/rootfs as the target prefix */
        r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
-       free(lxcpath);
        if (r < 0 || r >= MAXPATHLEN)
                goto skipvarlib;
 
@@ -1648,7 +1872,9 @@ static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
        if (!fstab)
                return 0;
 
+       process_lock();
        file = setmntent(fstab, "r");
+       process_unlock();
        if (!file) {
                SYSERROR("failed to use '%s'", fstab);
                return -1;
@@ -1656,7 +1882,9 @@ static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
 
        ret = mount_file_entries(rootfs, file, lxc_name);
 
+       process_lock();
        endmntent(file);
+       process_unlock();
        return ret;
 }
 
@@ -1668,7 +1896,9 @@ static int setup_mount_entries(const struct lxc_rootfs *rootfs, struct lxc_list
        char *mount_entry;
        int ret;
 
+       process_lock();
        file = tmpfile();
+       process_unlock();
        if (!file) {
                ERROR("tmpfile error: %m");
                return -1;
@@ -1683,7 +1913,9 @@ static int setup_mount_entries(const struct lxc_rootfs *rootfs, struct lxc_list
 
        ret = mount_file_entries(rootfs, file, lxc_name);
 
+       process_lock();
        fclose(file);
+       process_unlock();
        return ret;
 }
 
@@ -1738,7 +1970,76 @@ static int setup_caps(struct lxc_list *caps)
 
        }
 
-       DEBUG("capabilities has been setup");
+       DEBUG("capabilities have been setup");
+
+       return 0;
+}
+
+static int dropcaps_except(struct lxc_list *caps)
+{
+       struct lxc_list *iterator;
+       char *keep_entry;
+       char *ptr;
+       int i, capid;
+       int numcaps = lxc_caps_last_cap() + 1;
+       INFO("found %d capabilities\n", numcaps);
+
+       if (numcaps <= 0 || numcaps > 200)
+               return -1;
+
+       // caplist[i] is 1 if we keep capability i
+       int *caplist = alloca(numcaps * sizeof(int));
+       memset(caplist, 0, numcaps * sizeof(int));
+
+       lxc_list_for_each(iterator, caps) {
+
+               keep_entry = iterator->elem;
+
+               capid = -1;
+
+               for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
+
+                       if (strcmp(keep_entry, caps_opt[i].name))
+                               continue;
+
+                       capid = caps_opt[i].value;
+                       break;
+               }
+
+               if (capid < 0) {
+                       /* try to see if it's numeric, so the user may specify
+                       * capabilities that the running kernel knows about but
+                       * we don't */
+                       capid = strtol(keep_entry, &ptr, 10);
+                       if (!ptr || *ptr != '\0' ||
+                       capid == LONG_MIN || capid == LONG_MAX)
+                               /* not a valid number */
+                               capid = -1;
+                       else if (capid > lxc_caps_last_cap())
+                               /* we have a number but it's not a valid
+                               * capability */
+                               capid = -1;
+               }
+
+               if (capid < 0) {
+                       ERROR("unknown capability %s", keep_entry);
+                       return -1;
+               }
+
+               DEBUG("drop capability '%s' (%d)", keep_entry, capid);
+
+               caplist[capid] = 1;
+       }
+       for (i=0; i<numcaps; i++) {
+               if (caplist[i])
+                       continue;
+               if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
+                       SYSERROR("failed to remove capability %d", i);
+                       return -1;
+                }
+       }
+
+       DEBUG("capabilities have been setup");
 
        return 0;
 }
@@ -1759,14 +2060,18 @@ static int setup_hw_addr(char *hwaddr, const char *ifname)
        memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
        memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
 
+       process_lock();
        fd = socket(AF_INET, SOCK_DGRAM, 0);
+       process_unlock();
        if (fd < 0) {
                ERROR("socket failure : %s", strerror(errno));
                return -1;
        }
 
        ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
+       process_lock();
        close(fd);
+       process_unlock();
        if (ret)
                ERROR("ioctl failure : %s", strerror(errno));
 
@@ -2013,20 +2318,26 @@ static int setup_private_host_hw_addr(char *veth1)
        int err;
        int sockfd;
 
+       process_lock();
        sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+       process_unlock();
        if (sockfd < 0)
                return -errno;
 
        snprintf((char *)ifr.ifr_name, IFNAMSIZ, "%s", veth1);
        err = ioctl(sockfd, SIOCGIFHWADDR, &ifr);
        if (err < 0) {
+               process_lock();
                close(sockfd);
+               process_unlock();
                return -errno;
        }
 
        ifr.ifr_hwaddr.sa_data[0] = 0xfe;
        err = ioctl(sockfd, SIOCSIFHWADDR, &ifr);
+       process_lock();
        close(sockfd);
+       process_unlock();
        if (err < 0)
                return -errno;
 
@@ -2056,29 +2367,37 @@ struct lxc_conf *lxc_conf_init(void)
        }
        memset(new, 0, sizeof(*new));
 
+       new->loglevel = LXC_LOG_PRIORITY_NOTSET;
        new->personality = -1;
        new->console.log_path = NULL;
        new->console.log_fd = -1;
        new->console.path = NULL;
        new->console.peer = -1;
+       new->console.peerpty.busy = -1;
+       new->console.peerpty.master = -1;
+       new->console.peerpty.slave = -1;
        new->console.master = -1;
        new->console.slave = -1;
        new->console.name[0] = '\0';
        new->maincmd_fd = -1;
-       new->rootfs.mount = default_rootfs_mount;
+       new->rootfs.mount = strdup(default_rootfs_mount);
+       if (!new->rootfs.mount) {
+               ERROR("lxc_conf_init : %m");
+               free(new);
+               return NULL;
+       }
+       new->kmsg = 1;
        lxc_list_init(&new->cgroup);
        lxc_list_init(&new->network);
        lxc_list_init(&new->mount_list);
        lxc_list_init(&new->caps);
+       lxc_list_init(&new->keepcaps);
        lxc_list_init(&new->id_map);
        for (i=0; i<NUM_LXC_HOOKS; i++)
                lxc_list_init(&new->hooks[i]);
-#if HAVE_APPARMOR
-       new->aa_profile = NULL;
-#endif
-#if HAVE_APPARMOR /* || HAVE_SMACK || HAVE_SELINUX */
+       new->lsm_aa_profile = NULL;
+       new->lsm_se_context = NULL;
        new->lsm_umount_proc = 0;
-#endif
 
        return new;
 }
@@ -2097,13 +2416,13 @@ static int instanciate_veth(struct lxc_handler *handler, struct lxc_netdev *netd
                        ERROR("veth1 name too long");
                        return -1;
                }
-               veth1 = mktemp(veth1buf);
+               veth1 = mkifname(veth1buf);
                /* store away for deconf */
                memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
        }
 
        snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
-       veth2 = mktemp(veth2buf);
+       veth2 = mkifname(veth2buf);
 
        if (!strlen(veth1) || !strlen(veth2)) {
                ERROR("failed to allocate a temporary name");
@@ -2209,7 +2528,7 @@ static int instanciate_macvlan(struct lxc_handler *handler, struct lxc_netdev *n
        if (err >= sizeof(peerbuf))
                return -1;
 
-       peer = mktemp(peerbuf);
+       peer = mkifname(peerbuf);
        if (!strlen(peer)) {
                ERROR("failed to make a temporary name");
                return -1;
@@ -2445,10 +2764,11 @@ int lxc_assign_network(struct lxc_list *network, pid_t pid)
        return 0;
 }
 
-int add_id_mapping(enum idtype idtype, pid_t pid, uid_t host_start, uid_t ns_start, int range)
+static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
+                           size_t buf_size)
 {
        char path[PATH_MAX];
-       int ret;
+       int ret, closeret;
        FILE *f;
 
        ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid, idtype == ID_TYPE_UID ? 'u' : 'g');
@@ -2456,16 +2776,22 @@ int add_id_mapping(enum idtype idtype, pid_t pid, uid_t host_start, uid_t ns_sta
                fprintf(stderr, "%s: path name too long", __func__);
                return -E2BIG;
        }
+       process_lock();
        f = fopen(path, "w");
+       process_unlock();
        if (!f) {
                perror("open");
                return -EINVAL;
        }
-       ret = fprintf(f, "%d %d %d", ns_start, host_start, range);
+       ret = fwrite(buf, buf_size, 1, f);
        if (ret < 0)
-               perror("write");
-       fclose(f);
-       return ret < 0 ? ret : 0;
+               SYSERROR("writing id mapping");
+       process_lock();
+       closeret = fclose(f);
+       process_unlock();
+       if (closeret)
+               SYSERROR("writing id mapping");
+       return ret < 0 ? ret : closeret;
 }
 
 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
@@ -2473,13 +2799,39 @@ int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
        struct lxc_list *iterator;
        struct id_map *map;
        int ret = 0;
-
-       lxc_list_for_each(iterator, idmap) {
-               map = iterator->elem;
-               ret = add_id_mapping(map->idtype, pid, map->hostid, map->nsid, map->range);
+       enum idtype type;
+       char *buf = NULL, *pos;
+
+       for(type = ID_TYPE_UID; type <= ID_TYPE_GID; type++) {
+               int left, fill;
+
+               pos = buf;
+               lxc_list_for_each(iterator, idmap) {
+                       /* The kernel only takes <= 4k for writes to /proc/<nr>/[ug]id_map */
+                       if (!buf)
+                               buf = pos = malloc(4096);
+                       if (!buf)
+                               return -ENOMEM;
+
+                       map = iterator->elem;
+                       if (map->idtype == type) {
+                               left = 4096 - (pos - buf);
+                               fill = snprintf(pos, left, "%lu %lu %lu\n",
+                                       map->nsid, map->hostid, map->range);
+                               if (fill <= 0 || fill >= left)
+                                       SYSERROR("snprintf failed, too many mappings");
+                               pos += fill;
+                       }
+               }
+               if (pos == buf) // no mappings were found
+                       continue;
+               ret = write_id_mapping(type, pid, buf, pos-buf);
                if (ret)
                        break;
        }
+
+       if (buf)
+               free(buf);
        return ret;
 }
 
@@ -2534,7 +2886,7 @@ int lxc_find_gateway_addresses(struct lxc_handler *handler)
 int lxc_create_tty(const char *name, struct lxc_conf *conf)
 {
        struct lxc_tty_info *tty_info = &conf->tty_info;
-       int i;
+       int i, ret;
 
        /* no tty in the configuration */
        if (!conf->tty)
@@ -2551,8 +2903,11 @@ int lxc_create_tty(const char *name, struct lxc_conf *conf)
 
                struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
 
-               if (openpty(&pty_info->master, &pty_info->slave,
-                           pty_info->name, NULL, NULL)) {
+               process_lock();
+               ret = openpty(&pty_info->master, &pty_info->slave,
+                           pty_info->name, NULL, NULL);
+               process_unlock();
+               if (ret) {
                        SYSERROR("failed to create pty #%d", i);
                        tty_info->nbtty = i;
                        lxc_delete_tty(tty_info);
@@ -2583,8 +2938,10 @@ void lxc_delete_tty(struct lxc_tty_info *tty_info)
        for (i = 0; i < tty_info->nbtty; i++) {
                struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
 
+               process_lock();
                close(pty_info->master);
                close(pty_info->slave);
+               process_unlock();
        }
 
        free(tty_info->pty_info);
@@ -2678,12 +3035,8 @@ int uid_shift_ttys(int pid, struct lxc_conf *conf)
        return 0;
 }
 
-int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
+int lxc_setup(const char *name, struct lxc_conf *lxc_conf, const char *lxcpath, struct cgroup_process_info *cgroup_info)
 {
-#if HAVE_APPARMOR /* || HAVE_SMACK || HAVE_SELINUX */
-       int mounted;
-#endif
-
        if (setup_utsname(lxc_conf->utsname)) {
                ERROR("failed to setup the utsname for '%s'", name);
                return -1;
@@ -2694,7 +3047,7 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
                return -1;
        }
 
-       if (run_lxc_hooks(name, "pre-mount", lxc_conf)) {
+       if (run_lxc_hooks(name, "pre-mount", lxc_conf, lxcpath, NULL)) {
                ERROR("failed to run pre-mount hooks for container '%s'.", name);
                return -1;
        }
@@ -2711,6 +3064,14 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
                }
        }
 
+       /* do automatic mounts (mainly /proc and /sys), but exclude
+        * those that need to wait until other stuff has finished
+        */
+       if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP & ~LXC_AUTO_PROC_SYSRQ, cgroup_info) < 0) {
+               ERROR("failed to setup the automatic mounts for '%s'", name);
+               return -1;
+       }
+
        if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name)) {
                ERROR("failed to setup the mounts for '%s'", name);
                return -1;
@@ -2721,13 +3082,22 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
                return -1;
        }
 
-       if (run_lxc_hooks(name, "mount", lxc_conf)) {
+       /* now mount only cgroup, if wanted;
+        * before, /sys could not have been mounted
+        * (is either mounted automatically or via fstab entries)
+        */
+       if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP, cgroup_info) < 0) {
+               ERROR("failed to setup the automatic mounts for '%s'", name);
+               return -1;
+       }
+
+       if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
                ERROR("failed to run mount hooks for container '%s'.", name);
                return -1;
        }
 
        if (lxc_conf->autodev) {
-               if (run_lxc_hooks(name, "autodev", lxc_conf)) {
+               if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
                        ERROR("failed to run autodev hooks for container '%s'.", name);
                        return -1;
                }
@@ -2737,33 +3107,34 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
                }
        }
 
-       if (setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
+       /* over-mount /proc/sysrq-trigger with /dev/null now, if wanted;
+        * before /dev/null did not necessarily exist
+        */
+       if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_PROC_SYSRQ, cgroup_info) < 0) {
+               ERROR("failed to setup the automatic mounts for '%s'", name);
+               return -1;
+       }
+
+       if (!lxc_conf->is_execute && setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
                ERROR("failed to setup the console for '%s'", name);
                return -1;
        }
 
-       if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console))  // don't fail
-               ERROR("failed to setup kmsg for '%s'", name);
+       if (lxc_conf->kmsg) {
+               if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console))  // don't fail
+                       ERROR("failed to setup kmsg for '%s'", name);
+       }
 
-       if (setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)) {
+       if (!lxc_conf->is_execute && setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)) {
                ERROR("failed to setup the ttys for '%s'", name);
                return -1;
        }
 
-#if HAVE_APPARMOR /* || HAVE_SMACK || HAVE_SELINUX */
-       INFO("rootfs path is .%s., mount is .%s.", lxc_conf->rootfs.path,
-               lxc_conf->rootfs.mount);
-       if (lxc_conf->rootfs.path == NULL || strlen(lxc_conf->rootfs.path) == 0)
-               mounted = 0;
-       else
-               mounted = lsm_mount_proc_if_needed(lxc_conf->rootfs.path, lxc_conf->rootfs.mount);
-       if (mounted == -1) {
-               SYSERROR("failed to mount /proc in the container.");
+       /* mount /proc if needed for LSM transition */
+       if (lsm_proc_mount(lxc_conf) < 0) {
+               ERROR("failed to LSM mount proc for '%s'", name);
                return -1;
-       } else if (mounted == 1) {
-               lxc_conf->lsm_umount_proc = 1;
        }
-#endif
 
        if (setup_pivot_root(&lxc_conf->rootfs)) {
                ERROR("failed to set rootfs for '%s'", name);
@@ -2781,7 +3152,16 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
        }
 
        if (lxc_list_empty(&lxc_conf->id_map)) {
-               if (setup_caps(&lxc_conf->caps)) {
+               if (!lxc_list_empty(&lxc_conf->keepcaps)) {
+                       if (!lxc_list_empty(&lxc_conf->caps)) {
+                               ERROR("Simultaneously requested dropping and keeping caps");
+                               return -1;
+                       }
+                       if (dropcaps_except(&lxc_conf->keepcaps)) {
+                               ERROR("failed to keep requested caps\n");
+                               return -1;
+                       }
+               } else if (setup_caps(&lxc_conf->caps)) {
                        ERROR("failed to drop capabilities");
                        return -1;
                }
@@ -2792,7 +3172,8 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
        return 0;
 }
 
-int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf)
+int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
+                 const char *lxcpath, char *argv[])
 {
        int which = -1;
        struct lxc_list *it;
@@ -2809,12 +3190,14 @@ int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf)
                which = LXCHOOK_START;
        else if (strcmp(hook, "post-stop") == 0)
                which = LXCHOOK_POSTSTOP;
+       else if (strcmp(hook, "clone") == 0)
+               which = LXCHOOK_CLONE;
        else
                return -1;
        lxc_list_for_each(it, &conf->hooks[which]) {
                int ret;
                char *hookname = it->elem;
-               ret = run_script(name, "lxc", hookname, hook, NULL);
+               ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
                if (ret)
                        return ret;
        }
@@ -2965,6 +3348,30 @@ int lxc_clear_config_caps(struct lxc_conf *c)
        return 0;
 }
 
+int lxc_clear_idmaps(struct lxc_conf *c) /* empties c->id_map, freeing every node and its elem payload; always returns 0 */
+{
+       struct lxc_list *it, *next;
+
+       lxc_list_for_each_safe(it, &c->id_map, next) { /* NOTE(review): `next` presumably lets the walk survive freeing `it` -- confirm against the list header */
+               lxc_list_del(it); /* unlink the node before releasing it */
+               free(it->elem);
+               free(it);
+       }
+       return 0;
+}
+
+int lxc_clear_config_keepcaps(struct lxc_conf *c) /* empties c->keepcaps, freeing every node and its elem payload; always returns 0 */
+{
+       struct lxc_list *it,*next;
+
+       lxc_list_for_each_safe(it, &c->keepcaps, next) { /* NOTE(review): `next` presumably lets the walk survive freeing `it` -- confirm against the list header */
+               lxc_list_del(it); /* unlink the node before releasing it */
+               free(it->elem);
+               free(it);
+       }
+       return 0;
+}
+
 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
 {
        struct lxc_list *it,*next;
@@ -3045,7 +3452,7 @@ void lxc_conf_free(struct lxc_conf *conf)
                return;
        if (conf->console.path)
                free(conf->console.path);
-       if (conf->rootfs.mount != default_rootfs_mount)
+       if (conf->rootfs.mount)
                free(conf->rootfs.mount);
        if (conf->rootfs.path)
                free(conf->rootfs.path);
@@ -3055,16 +3462,20 @@ void lxc_conf_free(struct lxc_conf *conf)
                free(conf->ttydir);
        if (conf->fstab)
                free(conf->fstab);
+       if (conf->rcfile)
+               free(conf->rcfile);
        lxc_clear_config_network(conf);
-#if HAVE_APPARMOR
-       if (conf->aa_profile)
-               free(conf->aa_profile);
-#endif
+       if (conf->lsm_aa_profile)
+               free(conf->lsm_aa_profile);
+       if (conf->lsm_se_context)
+               free(conf->lsm_se_context);
        lxc_seccomp_free(conf);
        lxc_clear_config_caps(conf);
+       lxc_clear_config_keepcaps(conf);
        lxc_clear_cgroups(conf, "lxc.cgroup");
        lxc_clear_hooks(conf, "lxc.hook");
        lxc_clear_mount_entries(conf);
        lxc_clear_saved_nics(conf);
+       lxc_clear_idmaps(conf);
        free(conf);
 }