git.proxmox.com Git - mirror_lxc.git/blobdiff - src/lxc/conf.c
refactor AppArmor into LSM backend, add SELinux support
[mirror_lxc.git] / src / lxc / conf.c
index a1aee141fbda9fde3232afa4fdd108bae2407c37..18a92c9d2e2f8845416cf066aba6e38615861da1 100644 (file)
@@ -18,7 +18,7 @@
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #define _GNU_SOURCE
 #include <stdio.h>
 #include <unistd.h>
 #include <sys/wait.h>
 #include <sys/syscall.h>
+#include <time.h>
+
+#if HAVE_IFADDRS_H
+#include <ifaddrs.h>
+#else
+#include <../include/ifaddrs.h>
+#endif
 
 #if HAVE_PTY_H
 #include <pty.h>
 #include "lxc.h"       /* for lxc_cgroup_set() */
 #include "caps.h"       /* for lxc_caps_last_cap() */
 #include "bdev.h"
-
-#if HAVE_APPARMOR
-#include <apparmor.h>
-#endif
+#include "cgroup.h"
+#include "lxclock.h"
+#include "lsm/lsm.h"
 
 #if HAVE_SYS_CAPABILITY_H
 #include <sys/capability.h>
@@ -272,13 +278,83 @@ static struct caps_opt caps_opt[] = {
 static struct caps_opt caps_opt[] = {};
 #endif
 
+static char padchar[] =
+"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+static char *mkifname(char *template)
+{
+       char *name = NULL;
+       int i = 0;
+       FILE *urandom;
+       unsigned int seed;
+       struct ifaddrs *ifaddr, *ifa;
+       int ifexists = 0;
+
+       /* Get all the network interfaces */
+       getifaddrs(&ifaddr);
+
+       /* Initialize the random number generator */
+       process_lock();
+       urandom = fopen ("/dev/urandom", "r");
+       process_unlock();
+       if (urandom != NULL) {
+               if (fread (&seed, sizeof(seed), 1, urandom) <= 0)
+                       seed = time(0);
+               process_lock();
+               fclose(urandom);
+               process_unlock();
+       }
+       else
+               seed = time(0);
+
+#ifndef HAVE_RAND_R
+       srand(seed);
+#endif
+
+       /* Generate random names until we find one that doesn't exist */
+       while(1) {
+               ifexists = 0;
+               name = strdup(template);
+
+               if (name == NULL)
+                       return NULL;
+
+               for (i = 0; i < strlen(name); i++) {
+                       if (name[i] == 'X') {
+#ifdef HAVE_RAND_R
+                               name[i] = padchar[rand_r(&seed) % (strlen(padchar) - 1)];
+#else
+                               name[i] = padchar[rand() % (strlen(padchar) - 1)];
+#endif
+                       }
+               }
+
+               for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) {
+                       if (strcmp(ifa->ifa_name, name) == 0) {
+                               ifexists = 1;
+                               break;
+                       }
+               }
+
+               if (ifexists == 0)
+                       break;
+
+               free(name);
+       }
+
+       freeifaddrs(ifaddr);
+       return name;
+}
+
 static int run_buffer(char *buffer)
 {
        FILE *f;
        char *output;
        int ret;
 
+       process_lock();
        f = popen(buffer, "r");
+       process_unlock();
        if (!f) {
                SYSERROR("popen failed");
                return -1;
@@ -287,7 +363,9 @@ static int run_buffer(char *buffer)
        output = malloc(LXC_LOG_BUFFER_SIZE);
        if (!output) {
                ERROR("failed to allocate memory for script output");
+               process_lock();
                pclose(f);
+               process_unlock();
                return -1;
        }
 
@@ -296,7 +374,9 @@ static int run_buffer(char *buffer)
 
        free(output);
 
+       process_lock();
        ret = pclose(f);
+       process_unlock();
        if (ret == -1) {
                SYSERROR("Script exited on error");
                return -1;
@@ -313,7 +393,8 @@ static int run_buffer(char *buffer)
 }
 
 static int run_script_argv(const char *name, const char *section,
-                     const char *script, const char *hook, char **argsin)
+                     const char *script, const char *hook, const char *lxcpath,
+                     char **argsin)
 {
        int ret, i;
        char *buffer;
@@ -500,7 +581,9 @@ static int setup_lodev(const char *rootfs, int fd, struct loop_info64 *loinfo)
        int rfd;
        int ret = -1;
 
+       process_lock();
        rfd = open(rootfs, O_RDWR);
+       process_unlock();
        if (rfd < 0) {
                SYSERROR("failed to open '%s'", rootfs);
                return -1;
@@ -522,7 +605,9 @@ static int setup_lodev(const char *rootfs, int fd, struct loop_info64 *loinfo)
 
        ret = 0;
 out:
+       process_lock();
        close(rfd);
+       process_unlock();
 
        return ret;
 }
@@ -535,7 +620,9 @@ static int mount_rootfs_file(const char *rootfs, const char *target)
        DIR *dir;
        char path[MAXPATHLEN];
 
+       process_lock();
        dir = opendir("/dev");
+       process_unlock();
        if (!dir) {
                SYSERROR("failed to open '/dev'");
                return -1;
@@ -559,19 +646,25 @@ static int mount_rootfs_file(const char *rootfs, const char *target)
                if (rc < 0 || rc >= MAXPATHLEN)
                        continue;
 
+               process_lock();
                fd = open(path, O_RDWR);
+               process_unlock();
                if (fd < 0)
                        continue;
 
                if (ioctl(fd, LOOP_GET_STATUS64, &loinfo) == 0) {
+                       process_lock();
                        close(fd);
+                       process_unlock();
                        continue;
                }
 
                if (errno != ENXIO) {
                        WARN("unexpected error for ioctl on '%s': %m",
                             direntp->d_name);
+                       process_lock();
                        close(fd);
+                       process_unlock();
                        continue;
                }
 
@@ -580,13 +673,17 @@ static int mount_rootfs_file(const char *rootfs, const char *target)
                ret = setup_lodev(rootfs, fd, &loinfo);
                if (!ret)
                        ret = mount_unknow_fs(path, target, 0);
+               process_lock();
                close(fd);
+               process_unlock();
 
                break;
        }
 
+       process_lock();
        if (closedir(dir))
                WARN("failed to close directory");
+       process_unlock();
 
        return ret;
 }
@@ -598,9 +695,10 @@ static int mount_rootfs_block(const char *rootfs, const char *target)
 
 /*
  * pin_rootfs
- * if rootfs is a directory, then open ${rootfs}.hold for writing for the
- * duration of the container run, to prevent the container from marking the
- * underlying fs readonly on shutdown.
+ * if rootfs is a directory, then open ${rootfs}/lxc.hold for writing for
+ * the duration of the container run, to prevent the container from marking
+ * the underlying fs readonly on shutdown. Unlink the file immediately so
+ * no name pollution happens.
  * return -1 on error.
  * return -2 if nothing needed to be pinned.
  * return an open fd (>=0) if we pinned it.
@@ -615,35 +713,114 @@ int pin_rootfs(const char *rootfs)
        if (rootfs == NULL || strlen(rootfs) == 0)
                return -2;
 
-       if (!realpath(rootfs, absrootfs)) {
-               INFO("failed to get real path for '%s', not pinning", rootfs);
+       if (!realpath(rootfs, absrootfs))
                return -2;
-       }
 
-       if (access(absrootfs, F_OK)) {
-               SYSERROR("'%s' is not accessible", absrootfs);
+       if (access(absrootfs, F_OK))
                return -1;
-       }
 
-       if (stat(absrootfs, &s)) {
-               SYSERROR("failed to stat '%s'", absrootfs);
+       if (stat(absrootfs, &s))
                return -1;
-       }
 
        if (!S_ISDIR(s.st_mode))
                return -2;
 
-       ret = snprintf(absrootfspin, MAXPATHLEN, "%s%s", absrootfs, ".hold");
-       if (ret >= MAXPATHLEN) {
-               SYSERROR("pathname too long for rootfs hold file");
+       ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
+       if (ret >= MAXPATHLEN)
                return -1;
-       }
 
+       process_lock();
        fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
-       INFO("opened %s as fd %d\n", absrootfspin, fd);
+       process_unlock();
+       if (fd < 0)
+               return fd;
+       (void)unlink(absrootfspin);
        return fd;
 }
 
+static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct cgroup_process_info *cgroup_info)
+{
+       char *path = NULL;
+       char *dev_null = NULL;
+       int r;
+
+       dev_null = lxc_append_paths(conf->rootfs.mount, "/dev/null");
+       if (!dev_null) {
+               SYSERROR("memory allocation error");
+               goto cleanup;
+       }
+
+       if (flags & LXC_AUTO_PROC) {
+               path = lxc_append_paths(conf->rootfs.mount, "/proc");
+               if (!path) {
+                       SYSERROR("memory allocation error trying to automatically mount /proc");
+                       goto cleanup;
+               }
+
+               r = mount("proc", path, "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
+               if (r < 0) {
+                       SYSERROR("error mounting /proc");
+                       goto cleanup;
+               }
+
+               free(path);
+               path = NULL;
+       }
+
+       if (flags & LXC_AUTO_PROC_SYSRQ) {
+               path = lxc_append_paths(conf->rootfs.mount, "/proc/sysrq-trigger");
+               if (!path) {
+                       SYSERROR("memory allocation error trying to automatically mount /proc");
+                       goto cleanup;
+               }
+
+               /* safety measure, mount /dev/null over /proc/sysrq-trigger,
+                * otherwise, a container may trigger a host reboot or such
+                */
+               r = mount(dev_null, path, NULL, MS_BIND, NULL);
+               if (r < 0)
+                       WARN("error mounting /dev/null over /proc/sysrq-trigger: %s", strerror(errno));
+
+               free(path);
+               path = NULL;
+       }
+
+       if (flags & LXC_AUTO_SYS) {
+               path = lxc_append_paths(conf->rootfs.mount, "/sys");
+               if (!path) {
+                       SYSERROR("memory allocation error trying to automatically mount /sys");
+                       goto cleanup;
+               }
+
+               r = mount("sysfs", path, "sysfs", MS_RDONLY, NULL);
+               if (r < 0) {
+                       SYSERROR("error mounting /sys");
+                       goto cleanup;
+               }
+
+               free(path);
+               path = NULL;
+       }
+
+       if (flags & LXC_AUTO_CGROUP) {
+               r = lxc_setup_mount_cgroup(conf->rootfs.mount, cgroup_info);
+               if (r < 0) {
+                       SYSERROR("error mounting /sys/fs/cgroup");
+                       goto cleanup;
+               }
+       }
+
+       free(dev_null);
+       free(path);
+
+       return 0;
+
+cleanup:
+       free(dev_null);
+       free(path);
+       return -1;
+}
+
 static int mount_rootfs(const char *rootfs, const char *target)
 {
        char absrootfs[MAXPATHLEN];
@@ -730,13 +907,17 @@ static int setup_tty(const struct lxc_rootfs *rootfs,
                                ERROR("pathname too long for ttys");
                                return -1;
                        }
+                       process_lock();
                        ret = creat(lxcpath, 0660);
+                       process_unlock();
                        if (ret==-1 && errno != EEXIST) {
                                SYSERROR("error creating %s\n", lxcpath);
                                return -1;
                        }
+                       process_lock();
                        if (ret >= 0)
                                close(ret);
+                       process_unlock();
                        ret = unlink(path);
                        if (ret && errno != ENOENT) {
                                SYSERROR("error unlinking %s\n", path);
@@ -762,12 +943,17 @@ static int setup_tty(const struct lxc_rootfs *rootfs,
                } else {
                        /* If we populated /dev, then we need to create /dev/ttyN */
                        if (access(path, F_OK)) {
+                               process_lock();
                                ret = creat(path, 0660);
+                               process_unlock();
                                if (ret==-1) {
                                        SYSERROR("error creating %s\n", path);
                                        /* this isn't fatal, continue */
-                               } else
+                               } else {
+                                       process_lock();
                                        close(ret);
+                                       process_unlock();
+                               }
                        }
                        if (mount(pty_info->name, path, "none", MS_BIND, 0)) {
                                WARN("failed to mount '%s'->'%s'",
@@ -1075,7 +1261,9 @@ int detect_shared_rootfs(void)
        int i;
        char *p2;
 
+       process_lock();
        f = fopen("/proc/self/mountinfo", "r");
+       process_unlock();
        if (!f)
                return 0;
        while ((p = fgets(buf, LINELEN, f))) {
@@ -1093,12 +1281,16 @@ int detect_shared_rootfs(void)
                        // this is '/'.  is it shared?
                        p = index(p2+1, ' ');
                        if (p && strstr(p, "shared:")) {
+                               process_lock();
                                fclose(f);
+                               process_unlock();
                                return 1;
                        }
                }
        }
+       process_lock();
        fclose(f);
+       process_unlock();
        return 0;
 }
 
@@ -1192,9 +1384,12 @@ static int setup_rootfs(struct lxc_conf *conf)
        // First try mounting rootfs using a bdev
        struct bdev *bdev = bdev_init(rootfs->path, rootfs->mount, NULL);
        if (bdev && bdev->ops->mount(bdev) == 0) {
+               bdev_put(bdev);
                DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount);
                return 0;
        }
+       if (bdev)
+               bdev_put(bdev);
        if (mount_rootfs(rootfs->path, rootfs->mount)) {
                ERROR("failed to mount rootfs");
                return -1;
@@ -1293,8 +1488,8 @@ static int setup_dev_console(const struct lxc_rootfs *rootfs,
                return 0;
        }
 
-       if (console->peer == -1) {
-               INFO("no console output required");
+       if (console->master < 0) {
+               INFO("no console");
                return 0;
        }
 
@@ -1351,16 +1546,20 @@ static int setup_ttydir_console(const struct lxc_rootfs *rootfs,
                return -1;
        }
 
+       process_lock();
        ret = creat(lxcpath, 0660);
+       process_unlock();
        if (ret==-1 && errno != EEXIST) {
                SYSERROR("error %d creating %s\n", errno, lxcpath);
                return -1;
        }
+       process_lock();
        if (ret >= 0)
                close(ret);
+       process_unlock();
 
-       if (console->peer == -1) {
-               INFO("no console output required");
+       if (console->master < 0) {
+               INFO("no console");
                return 0;
        }
 
@@ -1426,47 +1625,6 @@ static int setup_kmsg(const struct lxc_rootfs *rootfs,
        return 0;
 }
 
-static int _setup_cgroup(const char *cgpath, struct lxc_list *cgroups,
-                         int devices)
-{
-       struct lxc_list *iterator;
-       struct lxc_cgroup *cg;
-       int ret = -1;
-
-       if (lxc_list_empty(cgroups))
-               return 0;
-
-       lxc_list_for_each(iterator, cgroups) {
-               cg = iterator->elem;
-
-               if (devices == !strncmp("devices", cg->subsystem, 7)) {
-                       if (lxc_cgroup_set_bypath(cgpath, cg->subsystem,
-                           cg->value)) {
-                               ERROR("Error setting %s to %s for %s\n",
-                                     cg->subsystem, cg->value, cgpath);
-                               goto out;
-                       }
-               }
-
-               DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
-       }
-
-       ret = 0;
-       INFO("cgroup has been setup");
-out:
-       return ret;
-}
-
-int setup_cgroup_devices(const char *cgpath, struct lxc_list *cgroups)
-{
-       return _setup_cgroup(cgpath, cgroups, 1);
-}
-
-int setup_cgroup(const char *cgpath, struct lxc_list *cgroups)
-{
-       return _setup_cgroup(cgpath, cgroups, 0);
-}
-
 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
 {
        struct mount_opt *mo;
@@ -1714,7 +1872,9 @@ static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
        if (!fstab)
                return 0;
 
+       process_lock();
        file = setmntent(fstab, "r");
+       process_unlock();
        if (!file) {
                SYSERROR("failed to use '%s'", fstab);
                return -1;
@@ -1722,7 +1882,9 @@ static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
 
        ret = mount_file_entries(rootfs, file, lxc_name);
 
+       process_lock();
        endmntent(file);
+       process_unlock();
        return ret;
 }
 
@@ -1734,7 +1896,9 @@ static int setup_mount_entries(const struct lxc_rootfs *rootfs, struct lxc_list
        char *mount_entry;
        int ret;
 
+       process_lock();
        file = tmpfile();
+       process_unlock();
        if (!file) {
                ERROR("tmpfile error: %m");
                return -1;
@@ -1749,7 +1913,9 @@ static int setup_mount_entries(const struct lxc_rootfs *rootfs, struct lxc_list
 
        ret = mount_file_entries(rootfs, file, lxc_name);
 
+       process_lock();
        fclose(file);
+       process_unlock();
        return ret;
 }
 
@@ -1804,7 +1970,76 @@ static int setup_caps(struct lxc_list *caps)
 
        }
 
-       DEBUG("capabilities has been setup");
+       DEBUG("capabilities have been setup");
+
+       return 0;
+}
+
+static int dropcaps_except(struct lxc_list *caps)
+{
+       struct lxc_list *iterator;
+       char *keep_entry;
+       char *ptr;
+       int i, capid;
+       int numcaps = lxc_caps_last_cap() + 1;
+       INFO("found %d capabilities\n", numcaps);
+
+       if (numcaps <= 0 || numcaps > 200)
+               return -1;
+
+       // caplist[i] is 1 if we keep capability i
+       int *caplist = alloca(numcaps * sizeof(int));
+       memset(caplist, 0, numcaps * sizeof(int));
+
+       lxc_list_for_each(iterator, caps) {
+
+               keep_entry = iterator->elem;
+
+               capid = -1;
+
+               for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
+
+                       if (strcmp(keep_entry, caps_opt[i].name))
+                               continue;
+
+                       capid = caps_opt[i].value;
+                       break;
+               }
+
+               if (capid < 0) {
+                       /* try to see if it's numeric, so the user may specify
+                       * capabilities that the running kernel knows about but
+                       * we don't */
+                       capid = strtol(keep_entry, &ptr, 10);
+                       if (!ptr || *ptr != '\0' ||
+                       capid == LONG_MIN || capid == LONG_MAX)
+                               /* not a valid number */
+                               capid = -1;
+                       else if (capid > lxc_caps_last_cap())
+                               /* we have a number but it's not a valid
+                               * capability */
+                               capid = -1;
+               }
+
+               if (capid < 0) {
+                       ERROR("unknown capability %s", keep_entry);
+                       return -1;
+               }
+
+               DEBUG("drop capability '%s' (%d)", keep_entry, capid);
+
+               caplist[capid] = 1;
+       }
+       for (i=0; i<numcaps; i++) {
+               if (caplist[i])
+                       continue;
+               if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
+                       SYSERROR("failed to remove capability %d", i);
+                       return -1;
+                }
+       }
+
+       DEBUG("capabilities have been setup");
 
        return 0;
 }
@@ -1825,14 +2060,18 @@ static int setup_hw_addr(char *hwaddr, const char *ifname)
        memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
        memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
 
+       process_lock();
        fd = socket(AF_INET, SOCK_DGRAM, 0);
+       process_unlock();
        if (fd < 0) {
                ERROR("socket failure : %s", strerror(errno));
                return -1;
        }
 
        ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
+       process_lock();
        close(fd);
+       process_unlock();
        if (ret)
                ERROR("ioctl failure : %s", strerror(errno));
 
@@ -2079,20 +2318,26 @@ static int setup_private_host_hw_addr(char *veth1)
        int err;
        int sockfd;
 
+       process_lock();
        sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+       process_unlock();
        if (sockfd < 0)
                return -errno;
 
        snprintf((char *)ifr.ifr_name, IFNAMSIZ, "%s", veth1);
        err = ioctl(sockfd, SIOCGIFHWADDR, &ifr);
        if (err < 0) {
+               process_lock();
                close(sockfd);
+               process_unlock();
                return -errno;
        }
 
        ifr.ifr_hwaddr.sa_data[0] = 0xfe;
        err = ioctl(sockfd, SIOCSIFHWADDR, &ifr);
+       process_lock();
        close(sockfd);
+       process_unlock();
        if (err < 0)
                return -errno;
 
@@ -2122,30 +2367,37 @@ struct lxc_conf *lxc_conf_init(void)
        }
        memset(new, 0, sizeof(*new));
 
+       new->loglevel = LXC_LOG_PRIORITY_NOTSET;
        new->personality = -1;
        new->console.log_path = NULL;
        new->console.log_fd = -1;
        new->console.path = NULL;
        new->console.peer = -1;
+       new->console.peerpty.busy = -1;
+       new->console.peerpty.master = -1;
+       new->console.peerpty.slave = -1;
        new->console.master = -1;
        new->console.slave = -1;
        new->console.name[0] = '\0';
        new->maincmd_fd = -1;
-       new->rootfs.mount = default_rootfs_mount;
+       new->rootfs.mount = strdup(default_rootfs_mount);
+       if (!new->rootfs.mount) {
+               ERROR("lxc_conf_init : %m");
+               free(new);
+               return NULL;
+       }
        new->kmsg = 1;
        lxc_list_init(&new->cgroup);
        lxc_list_init(&new->network);
        lxc_list_init(&new->mount_list);
        lxc_list_init(&new->caps);
+       lxc_list_init(&new->keepcaps);
        lxc_list_init(&new->id_map);
        for (i=0; i<NUM_LXC_HOOKS; i++)
                lxc_list_init(&new->hooks[i]);
-#if HAVE_APPARMOR
-       new->aa_profile = NULL;
-#endif
-#if HAVE_APPARMOR /* || HAVE_SMACK || HAVE_SELINUX */
+       new->lsm_aa_profile = NULL;
+       new->lsm_se_context = NULL;
        new->lsm_umount_proc = 0;
-#endif
 
        return new;
 }
@@ -2164,13 +2416,13 @@ static int instanciate_veth(struct lxc_handler *handler, struct lxc_netdev *netd
                        ERROR("veth1 name too long");
                        return -1;
                }
-               veth1 = mktemp(veth1buf);
+               veth1 = mkifname(veth1buf);
                /* store away for deconf */
                memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
        }
 
        snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
-       veth2 = mktemp(veth2buf);
+       veth2 = mkifname(veth2buf);
 
        if (!strlen(veth1) || !strlen(veth2)) {
                ERROR("failed to allocate a temporary name");
@@ -2276,7 +2528,7 @@ static int instanciate_macvlan(struct lxc_handler *handler, struct lxc_netdev *n
        if (err >= sizeof(peerbuf))
                return -1;
 
-       peer = mktemp(peerbuf);
+       peer = mkifname(peerbuf);
        if (!strlen(peer)) {
                ERROR("failed to make a temporary name");
                return -1;
@@ -2524,7 +2776,9 @@ static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
                fprintf(stderr, "%s: path name too long", __func__);
                return -E2BIG;
        }
+       process_lock();
        f = fopen(path, "w");
+       process_unlock();
        if (!f) {
                perror("open");
                return -EINVAL;
@@ -2532,7 +2786,9 @@ static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
        ret = fwrite(buf, buf_size, 1, f);
        if (ret < 0)
                SYSERROR("writing id mapping");
+       process_lock();
        closeret = fclose(f);
+       process_unlock();
        if (closeret)
                SYSERROR("writing id mapping");
        return ret < 0 ? ret : closeret;
@@ -2630,7 +2886,7 @@ int lxc_find_gateway_addresses(struct lxc_handler *handler)
 int lxc_create_tty(const char *name, struct lxc_conf *conf)
 {
        struct lxc_tty_info *tty_info = &conf->tty_info;
-       int i;
+       int i, ret;
 
        /* no tty in the configuration */
        if (!conf->tty)
@@ -2647,8 +2903,11 @@ int lxc_create_tty(const char *name, struct lxc_conf *conf)
 
                struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
 
-               if (openpty(&pty_info->master, &pty_info->slave,
-                           pty_info->name, NULL, NULL)) {
+               process_lock();
+               ret = openpty(&pty_info->master, &pty_info->slave,
+                           pty_info->name, NULL, NULL);
+               process_unlock();
+               if (ret) {
                        SYSERROR("failed to create pty #%d", i);
                        tty_info->nbtty = i;
                        lxc_delete_tty(tty_info);
@@ -2679,8 +2938,10 @@ void lxc_delete_tty(struct lxc_tty_info *tty_info)
        for (i = 0; i < tty_info->nbtty; i++) {
                struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
 
+               process_lock();
                close(pty_info->master);
                close(pty_info->slave);
+               process_unlock();
        }
 
        free(tty_info->pty_info);
@@ -2774,12 +3035,8 @@ int uid_shift_ttys(int pid, struct lxc_conf *conf)
        return 0;
 }
 
-int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
+int lxc_setup(const char *name, struct lxc_conf *lxc_conf, const char *lxcpath, struct cgroup_process_info *cgroup_info)
 {
-#if HAVE_APPARMOR /* || HAVE_SMACK || HAVE_SELINUX */
-       int mounted;
-#endif
-
        if (setup_utsname(lxc_conf->utsname)) {
                ERROR("failed to setup the utsname for '%s'", name);
                return -1;
@@ -2790,7 +3047,7 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
                return -1;
        }
 
-       if (run_lxc_hooks(name, "pre-mount", lxc_conf, NULL)) {
+       if (run_lxc_hooks(name, "pre-mount", lxc_conf, lxcpath, NULL)) {
                ERROR("failed to run pre-mount hooks for container '%s'.", name);
                return -1;
        }
@@ -2807,6 +3064,14 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
                }
        }
 
+       /* do automatic mounts (mainly /proc and /sys), but exclude
+        * those that need to wait until other stuff has finished
+        */
+       if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP & ~LXC_AUTO_PROC_SYSRQ, cgroup_info) < 0) {
+               ERROR("failed to setup the automatic mounts for '%s'", name);
+               return -1;
+       }
+
        if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name)) {
                ERROR("failed to setup the mounts for '%s'", name);
                return -1;
@@ -2817,13 +3082,22 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
                return -1;
        }
 
-       if (run_lxc_hooks(name, "mount", lxc_conf, NULL)) {
+       /* now mount only cgroup, if wanted;
+        * earlier, /sys might not have been mounted yet
+        * (it is mounted either automatically or via fstab entries)
+        */
+       if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP, cgroup_info) < 0) {
+               ERROR("failed to setup the automatic mounts for '%s'", name);
+               return -1;
+       }
+
+       if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
                ERROR("failed to run mount hooks for container '%s'.", name);
                return -1;
        }
 
        if (lxc_conf->autodev) {
-               if (run_lxc_hooks(name, "autodev", lxc_conf, NULL)) {
+               if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
                        ERROR("failed to run autodev hooks for container '%s'.", name);
                        return -1;
                }
@@ -2833,7 +3107,15 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
                }
        }
 
-       if (setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
+       /* over-mount /proc/sysrq-trigger with /dev/null now, if wanted;
+        * earlier, /dev/null did not necessarily exist
+        */
+       if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_PROC_SYSRQ, cgroup_info) < 0) {
+               ERROR("failed to setup the automatic mounts for '%s'", name);
+               return -1;
+       }
+
+       if (!lxc_conf->is_execute && setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
                ERROR("failed to setup the console for '%s'", name);
                return -1;
        }
@@ -2843,25 +3125,16 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
                        ERROR("failed to setup kmsg for '%s'", name);
        }
 
-       if (setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)) {
+       if (!lxc_conf->is_execute && setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)) {
                ERROR("failed to setup the ttys for '%s'", name);
                return -1;
        }
 
-#if HAVE_APPARMOR /* || HAVE_SMACK || HAVE_SELINUX */
-       INFO("rootfs path is .%s., mount is .%s.", lxc_conf->rootfs.path,
-               lxc_conf->rootfs.mount);
-       if (lxc_conf->rootfs.path == NULL || strlen(lxc_conf->rootfs.path) == 0)
-               mounted = 0;
-       else
-               mounted = lsm_mount_proc_if_needed(lxc_conf->rootfs.path, lxc_conf->rootfs.mount);
-       if (mounted == -1) {
-               SYSERROR("failed to mount /proc in the container.");
+       /* mount /proc if needed for LSM transition */
+       if (lsm_proc_mount(lxc_conf) < 0) {
+               ERROR("failed to LSM mount proc for '%s'", name);
                return -1;
-       } else if (mounted == 1) {
-               lxc_conf->lsm_umount_proc = 1;
        }
-#endif
 
        if (setup_pivot_root(&lxc_conf->rootfs)) {
                ERROR("failed to set rootfs for '%s'", name);
@@ -2879,7 +3152,16 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
        }
 
        if (lxc_list_empty(&lxc_conf->id_map)) {
-               if (setup_caps(&lxc_conf->caps)) {
+               if (!lxc_list_empty(&lxc_conf->keepcaps)) {
+                       if (!lxc_list_empty(&lxc_conf->caps)) {
+                               ERROR("Simultaneously requested dropping and keeping caps");
+                               return -1;
+                       }
+                       if (dropcaps_except(&lxc_conf->keepcaps)) {
+                               ERROR("failed to keep requested caps\n");
+                               return -1;
+                       }
+               } else if (setup_caps(&lxc_conf->caps)) {
                        ERROR("failed to drop capabilities");
                        return -1;
                }
@@ -2890,7 +3172,8 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
        return 0;
 }
 
-int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf, char *argv[])
+int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
+                 const char *lxcpath, char *argv[])
 {
        int which = -1;
        struct lxc_list *it;
@@ -2914,7 +3197,7 @@ int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf, char *arg
        lxc_list_for_each(it, &conf->hooks[which]) {
                int ret;
                char *hookname = it->elem;
-               ret = run_script_argv(name, "lxc", hookname, hook, argv);
+               ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
                if (ret)
                        return ret;
        }
@@ -3065,6 +3348,30 @@ int lxc_clear_config_caps(struct lxc_conf *c)
        return 0;
 }
 
+int lxc_clear_idmaps(struct lxc_conf *c)
+{
+       struct lxc_list *it, *next;
+
+       lxc_list_for_each_safe(it, &c->id_map, next) {
+               lxc_list_del(it);
+               free(it->elem);
+               free(it);
+       }
+       return 0;
+}
+
+int lxc_clear_config_keepcaps(struct lxc_conf *c)
+{
+       struct lxc_list *it,*next;
+
+       lxc_list_for_each_safe(it, &c->keepcaps, next) {
+               lxc_list_del(it);
+               free(it->elem);
+               free(it);
+       }
+       return 0;
+}
+
 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
 {
        struct lxc_list *it,*next;
@@ -3145,7 +3452,7 @@ void lxc_conf_free(struct lxc_conf *conf)
                return;
        if (conf->console.path)
                free(conf->console.path);
-       if (conf->rootfs.mount != default_rootfs_mount)
+       if (conf->rootfs.mount)
                free(conf->rootfs.mount);
        if (conf->rootfs.path)
                free(conf->rootfs.path);
@@ -3158,15 +3465,17 @@ void lxc_conf_free(struct lxc_conf *conf)
        if (conf->rcfile)
                free(conf->rcfile);
        lxc_clear_config_network(conf);
-#if HAVE_APPARMOR
-       if (conf->aa_profile)
-               free(conf->aa_profile);
-#endif
+       if (conf->lsm_aa_profile)
+               free(conf->lsm_aa_profile);
+       if (conf->lsm_se_context)
+               free(conf->lsm_se_context);
        lxc_seccomp_free(conf);
        lxc_clear_config_caps(conf);
+       lxc_clear_config_keepcaps(conf);
        lxc_clear_cgroups(conf, "lxc.cgroup");
        lxc_clear_hooks(conf, "lxc.hook");
        lxc_clear_mount_entries(conf);
        lxc_clear_saved_nics(conf);
+       lxc_clear_idmaps(conf);
        free(conf);
 }