]> git.proxmox.com Git - mirror_lxc.git/commitdiff
apparmor: profile generation
authorWolfgang Bumiller <w.bumiller@proxmox.com>
Wed, 25 Jul 2018 10:11:31 +0000 (12:11 +0200)
committerWolfgang Bumiller <w.bumiller@proxmox.com>
Wed, 25 Jul 2018 12:37:32 +0000 (14:37 +0200)
This copies lxd's apparmor profile generation. It tries to
detect features such as cgroup namespaces, apparmor
namespaces and stacking support, and conditionally adds
extra profile parts for unprivileged containers.

This introduces the following changes to the configuration:
  lxc.apparmor.profile = generated
    The fixed value 'generated' will cause this
    functionality to be used. Otherwise there should be no
    functional changes unless specifically requested with
    the next key:
  lxc.apparmor.allow_nesting
    This is a boolean which, if enabled, causes the
    following changes: When generated apparmor profiles are
    used, they will contain the necessary changes to allow
    creating a nested container. In addition to the usual
    mount points, /dev/.lxc/proc and /dev/.lxc/sys will
    contain procfs and sysfs mount points without the lxcfs
    overlays, which, if generated apparmor profiles are
    being used, will not be readable or writable directly.
  lxc.apparmor.raw
    A list of raw apparmor profile lines to append to the
    profile. Only valid when using generated profiles.

The following apparmor profile lines have not been copied
from lxd:

  mount /var/lib/lxd/shmounts/ -> /var/lib/lxd/shmounts/,
  mount none -> /var/lib/lxd/shmounts/,
  mount options=bind /var/lib/lxd/shmounts/** -> /var/lib/lxd/**,

They should be added via lxc.apparmor.raw entries by lxd.

In order for apparmor_parser's cache to be of use, this adds
a --with-apparmor-cache-dir ./configure option.

Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
12 files changed:
configure.ac
src/lxc/Makefile.am
src/lxc/conf.c
src/lxc/conf.h
src/lxc/confile.c
src/lxc/criu.c
src/lxc/lsm/apparmor.c
src/lxc/lsm/lsm.c
src/lxc/lsm/lsm.h
src/lxc/lsm/nop.c
src/lxc/lsm/selinux.c
src/lxc/start.c

index c24f8f3ee99227f073ff44ebfae9c487e8f43bd2..f1811205f0883a8db033e78c5d38aab6bdc50150 100644 (file)
@@ -469,6 +469,13 @@ AC_ARG_WITH([cgroup-pattern],
                [pattern for container cgroups]
        )], [], [with_cgroup_pattern=['lxc/%n']])
 
+# The path for the apparmor_parser's cache for generated apparmor profiles
+AC_ARG_WITH([apparmor-cache-dir],
+       [AC_HELP_STRING(
+               [--with-apparmor-cache-dir=dir],
+               [path for apparmor_parser cache]
+       )], [], [with_apparmor_cache_dir=['${localstatedir}/cache/lxc/apparmor']])
+
 # Container log path.  By default, use $lxcpath.
 AC_MSG_CHECKING([Whether to place logfiles in container config path])
 AC_ARG_ENABLE([configpath-log],
@@ -515,6 +522,7 @@ AS_AC_EXPAND(LXCBINHOOKDIR, "$libexecdir/lxc/hooks")
 AS_AC_EXPAND(LXCINITDIR, "$libexecdir")
 AS_AC_EXPAND(LOGPATH, "$with_log_path")
 AS_AC_EXPAND(RUNTIME_PATH, "$with_runtime_path")
+AS_AC_EXPAND(APPARMOR_CACHE_DIR, "$with_apparmor_cache_dir")
 AC_SUBST(DEFAULT_CGROUP_PATTERN, ["$with_cgroup_pattern"])
 
 # We need the install path so criu knows where to reference the hook scripts.
index c5e46ac280b21296f07aa45c867609dae5a0be75..1359eb3e43facc6249faa39086c0188dc47fd8b5 100644 (file)
@@ -174,6 +174,7 @@ AM_CFLAGS = -DLXCROOTFSMOUNT=\"$(LXCROOTFSMOUNT)\" \
            -DDEFAULT_CGROUP_PATTERN=\"$(DEFAULT_CGROUP_PATTERN)\" \
            -DRUNTIME_PATH=\"$(RUNTIME_PATH)\" \
            -DSBINDIR=\"$(SBINDIR)\" \
+           -DAPPARMOR_CACHE_DIR=\"$(APPARMOR_CACHE_DIR)\" \
            -I $(top_srcdir)/src \
            -I $(top_srcdir)/src/lxc \
            -I $(top_srcdir)/src/lxc/storage \
index 4d17c277dc2a1f23d34af96420762fe6bdc456d4..f5b94b091aaa9b1002899d80cb7a27a1fd4b7a4d 100644 (file)
@@ -2360,7 +2360,23 @@ static int setup_mount(const struct lxc_conf *conf,
        return ret;
 }
 
-FILE *make_anonymous_mount_file(struct lxc_list *mount)
+/*
+ * In order for nested containers to be able to mount /proc and /sys they need
+ * to see "pure" proc and sysfs mount points with nothing mounted on top
+ * (like lxcfs).
+ * For this we provide proc and sysfs in /dev/.lxc/{proc,sys} while using an
+ * apparmor rule to deny access to them. This is mostly for convenience: The
+ * container's root user can mount them anyway and thus has access to the two
+ * file systems. But a non-root user in the container should not be allowed to
+ * access them as a side effect without explicitly allowing it.
+ */
+static const char nesting_helpers[] =
+"proc dev/.lxc/proc proc create=dir,optional\n"
+"sys dev/.lxc/sys sysfs create=dir,optional\n"
+;
+
+FILE *make_anonymous_mount_file(struct lxc_list *mount,
+                               bool include_nesting_helpers)
 {
        int ret;
        char *mount_entry;
@@ -2402,6 +2418,13 @@ FILE *make_anonymous_mount_file(struct lxc_list *mount)
                        goto on_error;
        }
 
+       if (include_nesting_helpers) {
+               ret = lxc_write_nointr(fd, nesting_helpers,
+                                      sizeof(nesting_helpers) - 1);
+               if (ret != sizeof(nesting_helpers) - 1)
+                       goto on_error;
+       }
+
        ret = lseek(fd, 0, SEEK_SET);
        if (ret < 0)
                goto on_error;
@@ -2422,7 +2445,7 @@ static int setup_mount_entries(const struct lxc_conf *conf,
        int ret;
        FILE *f;
 
-       f = make_anonymous_mount_file(mount);
+       f = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
        if (!f)
                return -1;
 
@@ -2738,6 +2761,7 @@ struct lxc_conf *lxc_conf_init(void)
        lxc_list_init(&new->groups);
        lxc_list_init(&new->state_clients);
        new->lsm_aa_profile = NULL;
+       lxc_list_init(&new->lsm_aa_raw);
        new->lsm_se_context = NULL;
        new->tmp_umount_proc = false;
        new->tmp_umount_proc = 0;
@@ -4025,6 +4049,19 @@ void lxc_clear_includes(struct lxc_conf *conf)
        }
 }
 
+/*
+ * Empty the c->lsm_aa_raw list: every node is unlinked, then its element and
+ * the node itself are freed. Always returns 0.
+ */
+int lxc_clear_apparmor_raw(struct lxc_conf *c)
+{
+       struct lxc_list *it, *next;
+
+       /* The _safe iterator keeps 'next' so the current node can be freed. */
+       lxc_list_for_each_safe (it, &c->lsm_aa_raw, next) {
+               lxc_list_del(it);
+               free(it->elem);
+               free(it);
+       }
+
+       return 0;
+}
+
 void lxc_conf_free(struct lxc_conf *conf)
 {
        if (!conf)
@@ -4052,6 +4089,7 @@ void lxc_conf_free(struct lxc_conf *conf)
        free(conf->syslog);
        lxc_free_networks(&conf->network);
        free(conf->lsm_aa_profile);
+       free(conf->lsm_aa_profile_computed);
        free(conf->lsm_se_context);
        lxc_seccomp_free(conf);
        lxc_clear_config_caps(conf);
@@ -4068,6 +4106,7 @@ void lxc_conf_free(struct lxc_conf *conf)
        lxc_clear_limits(conf, "lxc.prlimit");
        lxc_clear_sysctls(conf, "lxc.sysctl");
        lxc_clear_procs(conf, "lxc.proc");
+       lxc_clear_apparmor_raw(conf);
        free(conf->cgroup_meta.dir);
        free(conf->cgroup_meta.controllers);
        free(conf->shmount.path_host);
index 8d7ded80ee84a77888ad3e1083ebab8683e03d70..6b299cab97c291110b1a221e68ff6b5db78f0fa1 100644 (file)
@@ -275,7 +275,11 @@ struct lxc_conf {
        };
 
        char *lsm_aa_profile;
+       char *lsm_aa_profile_computed;
+       bool lsm_aa_profile_created;
+       unsigned int lsm_aa_allow_nesting;
        unsigned int lsm_aa_allow_incomplete;
+       struct lxc_list lsm_aa_raw;
        char *lsm_se_context;
        bool tmp_umount_proc;
        char *seccomp;  /* filename with the seccomp rules */
@@ -427,7 +431,8 @@ extern int parse_mntopts(const char *mntopts, unsigned long *mntflags,
 extern void tmp_proc_unmount(struct lxc_conf *lxc_conf);
 extern void remount_all_slave(void);
 extern void suggest_default_idmap(void);
-extern FILE *make_anonymous_mount_file(struct lxc_list *mount);
+extern FILE *make_anonymous_mount_file(struct lxc_list *mount,
+                                      bool include_nesting_helpers);
 extern struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings);
 extern unsigned long add_required_remount_flags(const char *s, const char *d,
                                                unsigned long flags);
@@ -441,5 +446,6 @@ extern int setup_sysctl_parameters(struct lxc_list *sysctls);
 extern int lxc_clear_sysctls(struct lxc_conf *c, const char *key);
 extern int setup_proc_filesystem(struct lxc_list *procs, pid_t pid);
 extern int lxc_clear_procs(struct lxc_conf *c, const char *key);
+extern int lxc_clear_apparmor_raw(struct lxc_conf *c);
 
 #endif /* __LXC_CONF_H */
index 091dc67d9b5ab2ee82a02af5ade18c43e731ebbc..456cd4c2ed02fc3710c966e6d5ed5d74c5ae7749 100644 (file)
@@ -84,7 +84,9 @@ lxc_log_define(confile, lxc);
 
 lxc_config_define(autodev);
 lxc_config_define(apparmor_allow_incomplete);
+lxc_config_define(apparmor_allow_nesting);
 lxc_config_define(apparmor_profile);
+lxc_config_define(apparmor_raw);
 lxc_config_define(cap_drop);
 lxc_config_define(cap_keep);
 lxc_config_define(cgroup_controller);
@@ -158,6 +160,8 @@ static struct lxc_config_t config[] = {
        { "lxc.arch",                      set_config_personality,                 get_config_personality,                 clr_config_personality,               },
        { "lxc.apparmor.profile",          set_config_apparmor_profile,            get_config_apparmor_profile,            clr_config_apparmor_profile,          },
        { "lxc.apparmor.allow_incomplete", set_config_apparmor_allow_incomplete,   get_config_apparmor_allow_incomplete,   clr_config_apparmor_allow_incomplete, },
+       { "lxc.apparmor.allow_nesting",    set_config_apparmor_allow_nesting,      get_config_apparmor_allow_nesting,      clr_config_apparmor_allow_nesting,    },
+       { "lxc.apparmor.raw",              set_config_apparmor_raw,                get_config_apparmor_raw,                clr_config_apparmor_raw,              },
        { "lxc.autodev",                   set_config_autodev,                     get_config_autodev,                     clr_config_autodev,                   },
        { "lxc.cap.drop",                  set_config_cap_drop,                    get_config_cap_drop,                    clr_config_cap_drop,                  },
        { "lxc.cap.keep",                  set_config_cap_keep,                    get_config_cap_keep,                    clr_config_cap_keep,                  },
@@ -1132,6 +1136,52 @@ static int set_config_apparmor_allow_incomplete(const char *key,
        return 0;
 }
 
+/*
+ * Setter for the "lxc.apparmor.allow_nesting" key.
+ *
+ * An empty value resets the setting through the matching clr_ handler.
+ * Otherwise the value must parse as an unsigned integer that is 0 or 1.
+ *
+ * The value is parsed into a local first and only stored once it has been
+ * validated, so a rejected value (e.g. "2") never ends up in
+ * lxc_conf->lsm_aa_allow_nesting, where it would later be treated as "true".
+ *
+ * Returns 0 on success and -1 if the value cannot be parsed or is out of
+ * range; on failure the previous setting is left untouched.
+ */
+static int set_config_apparmor_allow_nesting(const char *key,
+                                            const char *value,
+                                            struct lxc_conf *lxc_conf,
+                                            void *data)
+{
+       unsigned int allow_nesting;
+
+       if (lxc_config_value_empty(value))
+               return clr_config_apparmor_allow_nesting(key, lxc_conf, NULL);
+
+       if (lxc_safe_uint(value, &allow_nesting) < 0)
+               return -1;
+
+       /* This is a boolean: only 0 and 1 are acceptable. */
+       if (allow_nesting > 1)
+               return -1;
+
+       lxc_conf->lsm_aa_allow_nesting = allow_nesting;
+
+       return 0;
+}
+
+/*
+ * Setter for the "lxc.apparmor.raw" key.
+ *
+ * An empty value clears the whole lsm_aa_raw list. Any other value is
+ * duplicated and appended as a new entry at the tail of that list, so
+ * repeated assignments accumulate in the order they were given.
+ *
+ * Returns 0 on success, -1 if either allocation fails.
+ */
+static int set_config_apparmor_raw(const char *key,
+                                  const char *value,
+                                  struct lxc_conf *lxc_conf,
+                                  void *data)
+{
+       char *elem;
+       struct lxc_list *list;
+
+       if (lxc_config_value_empty(value))
+               return lxc_clear_apparmor_raw(lxc_conf);
+
+       list = malloc(sizeof(*list));
+       if (!list) {
+               errno = ENOMEM;
+               return -1;
+       }
+
+       elem = strdup(value);
+       if (!elem) {
+               /* Do not leak the node when the string copy fails. */
+               free(list);
+               return -1;
+       }
+       list->elem = elem;
+
+       lxc_list_add_tail(&lxc_conf->lsm_aa_raw, list);
+
+       return 0;
+}
+
 static int set_config_selinux_context(const char *key, const char *value,
                                      struct lxc_conf *lxc_conf, void *data)
 {
@@ -3004,6 +3054,34 @@ static int get_config_apparmor_allow_incomplete(const char *key, char *retv,
                                c->lsm_aa_allow_incomplete);
 }
 
+/*
+ * Getter for the "lxc.apparmor.allow_nesting" key: hands the current value of
+ * c->lsm_aa_allow_nesting to lxc_get_conf_int() and returns its result.
+ */
+static int get_config_apparmor_allow_nesting(const char *key, char *retv,
+                                            int inlen, struct lxc_conf *c,
+                                            void *data)
+{
+       return lxc_get_conf_int(c, retv, inlen,
+                               c->lsm_aa_allow_nesting);
+}
+
+/*
+ * Getter for the "lxc.apparmor.raw" key: prints every entry of
+ * c->lsm_aa_raw, each followed by a newline, through the strprint macro.
+ *
+ * A NULL retv is treated as a zero-sized buffer; otherwise the buffer is
+ * zeroed before anything is printed. Returns fulllen.
+ *
+ * NOTE(review): strprint is defined outside this hunk; presumably it uses
+ * 'len' as scratch space and accumulates the printed length in 'fulllen' -
+ * confirm against its definition.
+ */
+static int get_config_apparmor_raw(const char *key, char *retv,
+                                  int inlen, struct lxc_conf *c,
+                                  void *data)
+{
+       int len;
+       struct lxc_list *it;
+       int fulllen = 0;
+
+       if (!retv)
+               inlen = 0;
+       else
+               memset(retv, 0, inlen);
+
+       lxc_list_for_each(it, &c->lsm_aa_raw) {
+               strprint(retv, inlen, "%s\n", (char *)it->elem);
+       }
+
+       return fulllen;
+}
+
 static int get_config_selinux_context(const char *key, char *retv, int inlen,
                                      struct lxc_conf *c, void *data)
 {
@@ -3794,6 +3872,21 @@ static inline int clr_config_apparmor_allow_incomplete(const char *key,
        return 0;
 }
 
+/*
+ * Clear handler for the "lxc.apparmor.allow_nesting" key: resets
+ * c->lsm_aa_allow_nesting to 0. Always returns 0.
+ */
+static inline int clr_config_apparmor_allow_nesting(const char *key,
+                                                   struct lxc_conf *c,
+                                                   void *data)
+{
+       c->lsm_aa_allow_nesting = 0;
+       return 0;
+}
+
+/*
+ * Clear handler for the "lxc.apparmor.raw" key: forwards to
+ * lxc_clear_apparmor_raw() and returns its result.
+ */
+static inline int clr_config_apparmor_raw(const char *key,
+                                         struct lxc_conf *c,
+                                         void *data)
+{
+       return lxc_clear_apparmor_raw(c);
+}
+
 static inline int clr_config_selinux_context(const char *key,
                                             struct lxc_conf *c, void *data)
 {
@@ -4986,7 +5079,9 @@ int lxc_list_subkeys(struct lxc_conf *conf, const char *key, char *retv,
 
        if (!strcmp(key, "lxc.apparmor")) {
                strprint(retv, inlen, "allow_incomplete\n");
+               strprint(retv, inlen, "allow_nesting\n");
                strprint(retv, inlen, "profile\n");
+               strprint(retv, inlen, "raw\n");
        } else if (!strcmp(key, "lxc.cgroup")) {
                strprint(retv, inlen, "dir\n");
        } else if (!strcmp(key, "lxc.selinux")) {
index c36421627e36ada36e4053fdd1b4c2a8df990e60..64ea4f0248d62e9dcf3c5385c134cd685999ce40 100644 (file)
@@ -378,7 +378,8 @@ static void exec_criu(struct cgroup_ops *cgroup_ops, struct criu_opts *opts)
                DECLARE_ARG(opts->user->action_script);
        }
 
-       mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list);
+       mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list,
+                                        opts->c->lxc_conf->lsm_aa_allow_nesting);
        if (!mnts)
                goto err;
 
index 5fe6d525673cc9f65e127d8f59f054bfd963e455..ec3f805de3c128825d4fd6486ea1103184673c2d 100644 (file)
 #include "conf.h"
 #include "utils.h"
 #include "initutils.h"
+#include "caps.h"
+#include "parse.h"
 
 lxc_log_define(apparmor, lsm);
 
 /* set by lsm_apparmor_drv_init if true */
 static int aa_enabled = 0;
+static bool aa_parser_available = false;
+static bool aa_supports_unix = false;
+static bool aa_can_stack = false;
+static bool aa_is_stacked = false;
+static bool aa_admin = false;
 
 static int mount_features_enabled = 0;
 
@@ -46,6 +53,332 @@ static int mount_features_enabled = 0;
 #define AA_MOUNT_RESTR "/sys/kernel/security/apparmor/features/mount/mask"
 #define AA_ENABLED_FILE "/sys/module/apparmor/parameters/enabled"
 #define AA_UNCHANGED "unchanged"
+#define AA_GENERATED "generated"
+
+#define AA_CMD_LOAD   'r'
+#define AA_CMD_UNLOAD 'R'
+#define AA_CMD_PARSE  'Q'
+
+static const char AA_PROFILE_BASE[] =
+"  ### Base profile\n"
+"  capability,\n"
+"  dbus,\n"
+"  file,\n"
+"  network,\n"
+"  umount,\n"
+"\n"
+"  # Allow us to receive signals from anywhere.\n"
+"  signal (receive),\n"
+"\n"
+"  # Allow us to send signals to ourselves\n"
+"  signal peer=@{profile_name},\n"
+"\n"
+"  # Allow other processes to read our /proc entries, futexes, perf tracing and\n"
+"  # kcmp for now (they will need 'read' in the first place). Administrators can\n"
+"  # override with:\n"
+"  #   deny ptrace (readby) ...\n"
+"  ptrace (readby),\n"
+"\n"
+"  # Allow other processes to trace us by default (they will need 'trace' in\n"
+"  # the first place). Administrators can override with:\n"
+"  #   deny ptrace (tracedby) ...\n"
+"  ptrace (tracedby),\n"
+"\n"
+"  # Allow us to ptrace ourselves\n"
+"  ptrace peer=@{profile_name},\n"
+"\n"
+"  # ignore DENIED message on / remount\n"
+"  deny mount options=(ro, remount) -> /,\n"
+"  deny mount options=(ro, remount, silent) -> /,\n"
+"\n"
+"  # allow tmpfs mounts everywhere\n"
+"  mount fstype=tmpfs,\n"
+"\n"
+"  # allow hugetlbfs mounts everywhere\n"
+"  mount fstype=hugetlbfs,\n"
+"\n"
+"  # allow mqueue mounts everywhere\n"
+"  mount fstype=mqueue,\n"
+"\n"
+"  # allow fuse mounts everywhere\n"
+"  mount fstype=fuse,\n"
+"  mount fstype=fuse.*,\n"
+"\n"
+"  # deny access under /proc/bus to avoid e.g. messing with pci devices directly\n"
+"  deny @{PROC}/bus/** wklx,\n"
+"\n"
+"  # deny writes in /proc/sys/fs but allow binfmt_misc to be mounted\n"
+"  mount fstype=binfmt_misc -> /proc/sys/fs/binfmt_misc/,\n"
+"  deny @{PROC}/sys/fs/** wklx,\n"
+"\n"
+"  # allow efivars to be mounted, writing to it will be blocked though\n"
+"  mount fstype=efivarfs -> /sys/firmware/efi/efivars/,\n"
+"\n"
+"  # block some other dangerous paths\n"
+"  deny @{PROC}/kcore rwklx,\n"
+"  deny @{PROC}/sysrq-trigger rwklx,\n"
+"\n"
+"  # deny writes in /sys except for /sys/fs/cgroup, also allow\n"
+"  # fusectl, securityfs and debugfs to be mounted there (read-only)\n"
+"  mount fstype=fusectl -> /sys/fs/fuse/connections/,\n"
+"  mount fstype=securityfs -> /sys/kernel/security/,\n"
+"  mount fstype=debugfs -> /sys/kernel/debug/,\n"
+"  deny mount fstype=debugfs -> /var/lib/ureadahead/debugfs/,\n"
+"  mount fstype=proc -> /proc/,\n"
+"  mount fstype=sysfs -> /sys/,\n"
+"  mount options=(rw, nosuid, nodev, noexec, remount) -> /sys/,\n"
+"  deny /sys/firmware/efi/efivars/** rwklx,\n"
+"  # note, /sys/kernel/security/** handled below\n"
+"  mount options=(ro, nosuid, nodev, noexec, remount, strictatime) -> /sys/fs/cgroup/,\n"
+"\n"
+"  # deny reads from debugfs\n"
+"  deny /sys/kernel/debug/{,**} rwklx,\n"
+"\n"
+"  # allow paths to be made slave, shared, private or unbindable\n"
+"  # FIXME: This currently doesn't work due to the apparmor parser treating those as allowing all mounts.\n"
+"#  mount options=(rw,make-slave) -> **,\n"
+"#  mount options=(rw,make-rslave) -> **,\n"
+"#  mount options=(rw,make-shared) -> **,\n"
+"#  mount options=(rw,make-rshared) -> **,\n"
+"#  mount options=(rw,make-private) -> **,\n"
+"#  mount options=(rw,make-rprivate) -> **,\n"
+"#  mount options=(rw,make-unbindable) -> **,\n"
+"#  mount options=(rw,make-runbindable) -> **,\n"
+"\n"
+"  # allow bind-mounts of anything except /proc, /sys and /dev\n"
+"  mount options=(rw,bind) /[^spd]*{,/**},\n"
+"  mount options=(rw,bind) /d[^e]*{,/**},\n"
+"  mount options=(rw,bind) /de[^v]*{,/**},\n"
+"  mount options=(rw,bind) /dev/.[^l]*{,/**},\n"
+"  mount options=(rw,bind) /dev/.l[^x]*{,/**},\n"
+"  mount options=(rw,bind) /dev/.lx[^c]*{,/**},\n"
+"  mount options=(rw,bind) /dev/.lxc?*{,/**},\n"
+"  mount options=(rw,bind) /dev/[^.]*{,/**},\n"
+"  mount options=(rw,bind) /dev?*{,/**},\n"
+"  mount options=(rw,bind) /p[^r]*{,/**},\n"
+"  mount options=(rw,bind) /pr[^o]*{,/**},\n"
+"  mount options=(rw,bind) /pro[^c]*{,/**},\n"
+"  mount options=(rw,bind) /proc?*{,/**},\n"
+"  mount options=(rw,bind) /s[^y]*{,/**},\n"
+"  mount options=(rw,bind) /sy[^s]*{,/**},\n"
+"  mount options=(rw,bind) /sys?*{,/**},\n"
+"\n"
+"  # allow read-only bind-mounts of anything except /proc, /sys and /dev\n"
+"  mount options=(ro,remount,bind) -> /[^spd]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /d[^e]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /de[^v]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /dev/.[^l]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /dev/.l[^x]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /dev/.lx[^c]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /dev/.lxc?*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /dev/[^.]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /dev?*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /p[^r]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /pr[^o]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /pro[^c]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /proc?*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /s[^y]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /sy[^s]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /sys?*{,/**},\n"
+"\n"
+"  # allow moving mounts except for /proc, /sys and /dev\n"
+"  mount options=(rw,move) /[^spd]*{,/**},\n"
+"  mount options=(rw,move) /d[^e]*{,/**},\n"
+"  mount options=(rw,move) /de[^v]*{,/**},\n"
+"  mount options=(rw,move) /dev/.[^l]*{,/**},\n"
+"  mount options=(rw,move) /dev/.l[^x]*{,/**},\n"
+"  mount options=(rw,move) /dev/.lx[^c]*{,/**},\n"
+"  mount options=(rw,move) /dev/.lxc?*{,/**},\n"
+"  mount options=(rw,move) /dev/[^.]*{,/**},\n"
+"  mount options=(rw,move) /dev?*{,/**},\n"
+"  mount options=(rw,move) /p[^r]*{,/**},\n"
+"  mount options=(rw,move) /pr[^o]*{,/**},\n"
+"  mount options=(rw,move) /pro[^c]*{,/**},\n"
+"  mount options=(rw,move) /proc?*{,/**},\n"
+"  mount options=(rw,move) /s[^y]*{,/**},\n"
+"  mount options=(rw,move) /sy[^s]*{,/**},\n"
+"  mount options=(rw,move) /sys?*{,/**},\n"
+"\n"
+"  # generated by: lxc-generate-aa-rules.py container-rules.base\n"
+"  deny /proc/sys/[^kn]*{,/**} wklx,\n"
+"  deny /proc/sys/k[^e]*{,/**} wklx,\n"
+"  deny /proc/sys/ke[^r]*{,/**} wklx,\n"
+"  deny /proc/sys/ker[^n]*{,/**} wklx,\n"
+"  deny /proc/sys/kern[^e]*{,/**} wklx,\n"
+"  deny /proc/sys/kerne[^l]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/[^smhd]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/d[^o]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/do[^m]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/dom[^a]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/doma[^i]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/domai[^n]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/domain[^n]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/domainn[^a]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/domainna[^m]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/domainnam[^e]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/domainname?*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/h[^o]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/ho[^s]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/hos[^t]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/host[^n]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/hostn[^a]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/hostna[^m]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/hostnam[^e]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/hostname?*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/m[^s]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/ms[^g]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/msg*/** wklx,\n"
+"  deny /proc/sys/kernel/s[^he]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/se[^m]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/sem*/** wklx,\n"
+"  deny /proc/sys/kernel/sh[^m]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/shm*/** wklx,\n"
+"  deny /proc/sys/kernel?*{,/**} wklx,\n"
+"  deny /proc/sys/n[^e]*{,/**} wklx,\n"
+"  deny /proc/sys/ne[^t]*{,/**} wklx,\n"
+"  deny /proc/sys/net?*{,/**} wklx,\n"
+"  deny /sys/[^fdck]*{,/**} wklx,\n"
+"  deny /sys/c[^l]*{,/**} wklx,\n"
+"  deny /sys/cl[^a]*{,/**} wklx,\n"
+"  deny /sys/cla[^s]*{,/**} wklx,\n"
+"  deny /sys/clas[^s]*{,/**} wklx,\n"
+"  deny /sys/class/[^n]*{,/**} wklx,\n"
+"  deny /sys/class/n[^e]*{,/**} wklx,\n"
+"  deny /sys/class/ne[^t]*{,/**} wklx,\n"
+"  deny /sys/class/net?*{,/**} wklx,\n"
+"  deny /sys/class?*{,/**} wklx,\n"
+"  deny /sys/d[^e]*{,/**} wklx,\n"
+"  deny /sys/de[^v]*{,/**} wklx,\n"
+"  deny /sys/dev[^i]*{,/**} wklx,\n"
+"  deny /sys/devi[^c]*{,/**} wklx,\n"
+"  deny /sys/devic[^e]*{,/**} wklx,\n"
+"  deny /sys/device[^s]*{,/**} wklx,\n"
+"  deny /sys/devices/[^v]*{,/**} wklx,\n"
+"  deny /sys/devices/v[^i]*{,/**} wklx,\n"
+"  deny /sys/devices/vi[^r]*{,/**} wklx,\n"
+"  deny /sys/devices/vir[^t]*{,/**} wklx,\n"
+"  deny /sys/devices/virt[^u]*{,/**} wklx,\n"
+"  deny /sys/devices/virtu[^a]*{,/**} wklx,\n"
+"  deny /sys/devices/virtua[^l]*{,/**} wklx,\n"
+"  deny /sys/devices/virtual/[^n]*{,/**} wklx,\n"
+"  deny /sys/devices/virtual/n[^e]*{,/**} wklx,\n"
+"  deny /sys/devices/virtual/ne[^t]*{,/**} wklx,\n"
+"  deny /sys/devices/virtual/net?*{,/**} wklx,\n"
+"  deny /sys/devices/virtual?*{,/**} wklx,\n"
+"  deny /sys/devices?*{,/**} wklx,\n"
+"  deny /sys/f[^s]*{,/**} wklx,\n"
+"  deny /sys/fs/[^c]*{,/**} wklx,\n"
+"  deny /sys/fs/c[^g]*{,/**} wklx,\n"
+"  deny /sys/fs/cg[^r]*{,/**} wklx,\n"
+"  deny /sys/fs/cgr[^o]*{,/**} wklx,\n"
+"  deny /sys/fs/cgro[^u]*{,/**} wklx,\n"
+"  deny /sys/fs/cgrou[^p]*{,/**} wklx,\n"
+"  deny /sys/fs/cgroup?*{,/**} wklx,\n"
+"  deny /sys/fs?*{,/**} wklx,\n"
+;
+
+static const char AA_PROFILE_UNIX_SOCKETS[] =
+"\n"
+"  ### Feature: unix\n"
+"  # Allow receive via unix sockets from anywhere\n"
+"  unix (receive),\n"
+"\n"
+"  # Allow all unix sockets in the container\n"
+"  unix peer=(label=@{profile_name}),\n"
+;
+
+static const char AA_PROFILE_CGROUP_NAMESPACES[] =
+"\n"
+"  ### Feature: cgroup namespace\n"
+"  mount fstype=cgroup -> /sys/fs/cgroup/**,\n"
+"  mount fstype=cgroup2 -> /sys/fs/cgroup/**,\n"
+;
+
+/* '_BASE' because we still need to append generated change_profile rules */
+static const char AA_PROFILE_STACKING_BASE[] =
+"\n"
+"  ### Feature: apparmor stacking\n"
+"  ### Configuration: apparmor profile loading (in namespace)\n"
+"  deny /sys/k[^e]*{,/**} wklx,\n"
+"  deny /sys/ke[^r]*{,/**} wklx,\n"
+"  deny /sys/ker[^n]*{,/**} wklx,\n"
+"  deny /sys/kern[^e]*{,/**} wklx,\n"
+"  deny /sys/kerne[^l]*{,/**} wklx,\n"
+"  deny /sys/kernel/[^s]*{,/**} wklx,\n"
+"  deny /sys/kernel/s[^e]*{,/**} wklx,\n"
+"  deny /sys/kernel/se[^c]*{,/**} wklx,\n"
+"  deny /sys/kernel/sec[^u]*{,/**} wklx,\n"
+"  deny /sys/kernel/secu[^r]*{,/**} wklx,\n"
+"  deny /sys/kernel/secur[^i]*{,/**} wklx,\n"
+"  deny /sys/kernel/securi[^t]*{,/**} wklx,\n"
+"  deny /sys/kernel/securit[^y]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/[^a]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/a[^p]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/ap[^p]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/app[^a]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/appa[^r]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/appar[^m]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/apparm[^o]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/apparmo[^r]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/apparmor?*{,/**} wklx,\n"
+"  deny /sys/kernel/security?*{,/**} wklx,\n"
+"  deny /sys/kernel?*{,/**} wklx,\n"
+;
+
+static const char AA_PROFILE_NO_STACKING[] =
+"\n"
+"  ### Feature: apparmor stacking (not present)\n"
+"  deny /sys/k*{,/**} rwklx,\n"
+;
+
+/* '_BASE' because we need to append change_profile for stacking */
+static const char AA_PROFILE_NESTING_BASE[] =
+"\n"
+"  ### Configuration: nesting\n"
+"  pivot_root,\n"
+"  ptrace,\n"
+"  signal,\n"
+"\n"
+   /* NOTE: See conf.c's "nesting_helpers" for details. */
+"  deny /dev/.lxc/proc/** rw,\n"
+"  deny /dev/.lxc/sys/** rw,\n"
+"\n"
+"  mount fstype=proc -> /usr/lib/*/lxc/**,\n"
+"  mount fstype=sysfs -> /usr/lib/*/lxc/**,\n"
+"  mount options=(rw,bind),\n"
+"  mount options=(rw,rbind),\n"
+"  mount options=(rw,make-rshared),\n"
+"\n"
+   /* FIXME: What's the state here on apparmor's side? */
+"  # there doesn't seem to be a way to ask for:\n"
+"  # mount options=(ro,nosuid,nodev,noexec,remount,bind),\n"
+"  # as we always get mount to $cdir/proc/sys with those flags denied\n"
+"  # So allow all mounts until that is straightened out:\n"
+"  mount,\n"
+;
+
+static const char AA_PROFILE_UNPRIVILEGED[] =
+"\n"
+"  ### Configuration: unprivileged container\n"
+"  pivot_root,\n"
+"\n"
+"  # Allow modifying mount propagation\n"
+"  mount options=(rw,make-slave) -> **,\n"
+"  mount options=(rw,make-rslave) -> **,\n"
+"  mount options=(rw,make-shared) -> **,\n"
+"  mount options=(rw,make-rshared) -> **,\n"
+"  mount options=(rw,make-private) -> **,\n"
+"  mount options=(rw,make-rprivate) -> **,\n"
+"  mount options=(rw,make-unbindable) -> **,\n"
+"  mount options=(rw,make-runbindable) -> **,\n"
+"\n"
+"  # Allow all bind-mounts\n"
+"  mount options=(rw,bind),\n"
+"  mount options=(rw,rbind),\n"
+"\n"
+"  # Allow remounting things read-only\n"
+"  mount options=(ro,remount),\n"
+;
 
 static bool check_mount_feature_enabled(void)
 {
@@ -144,11 +477,6 @@ static bool apparmor_am_unconfined(void)
        return ret;
 }
 
-/* aa stacking is not yet supported */
-static bool aa_stacking_supported(void) {
-       return false;
-}
-
 static bool aa_needs_transition(char *curlabel)
 {
        if (!curlabel)
@@ -160,61 +488,546 @@ static bool aa_needs_transition(char *curlabel)
        return true;
 }
 
+/*
+ * Format @num as exactly 16 lowercase hexadecimal digits (zero padded, most
+ * significant nibble first) into @buf and NUL-terminate it. @buf must have
+ * room for at least 17 bytes.
+ */
+static inline void uint64hex(char *buf, uint64_t num)
+{
+       size_t i;
+
+       buf[16] = 0;
+       /* fill from the last digit backwards; i takes the values 15..0 */
+       for (i = 16; i--;) {
+               char c = (char)(num & 0xf);
+               buf[i] = c + (c < 0xa ? '0' : 'a' - 0xa);
+               num >>= 4;
+       }
+}
+
+/*
+ * If @name is too long (strlen + 7 exceeds 253), replace its contents with the
+ * 16 hex digits of the 64 bit value returned by fnv_64a_buf() for it.
+ * The buffer is passed to must_realloc() in that case, so @name must be a heap
+ * pointer and callers must continue with the returned pointer only.
+ * NOTE(review): the 7 and 253 are not explained here - presumably headroom for
+ * fixed decoration under a name length limit; confirm.
+ */
+static inline char *shorten_apparmor_name(char *name)
+{
+       size_t len = strlen(name);
+       if (len + 7 > 253) {
+               uint64_t hash;
+               hash = fnv_64a_buf(name, len, FNV1A_64_INIT);
+               name = must_realloc(name, 16 + 1);
+               uint64hex(name, hash);
+       }
+
+       return name;
+}
+
+/* Replace every '/' in the NUL-terminated string @path with '-', in place. */
+static inline void sanitize_path(char *path)
+{
+       size_t i;
+
+       for (i = 0; path[i]; i++)
+               if (path[i] == '/')
+                       path[i] = '-';
+}
+
+/*
+ * Path of the per-container "apparmor" directory, built by must_make_path()
+ * from @lxcpath, @ctname and "apparmor". Callers in this file free() the
+ * result.
+ */
+static inline char *apparmor_dir(const char *ctname, const char *lxcpath)
+{
+       return must_make_path(lxcpath, ctname, "apparmor", NULL);
+}
+
+
+/*
+ * Name of the generated profile: "lxc-" + @ctname + "_<" + @lxcpath + ">",
+ * passed through shorten_apparmor_name() so overly long names are replaced by
+ * a hash. Callers in this file free() the result.
+ */
+static inline char *apparmor_profile_full(const char *ctname, const char *lxcpath)
+{
+       return shorten_apparmor_name(must_concat("lxc-", ctname, "_<", lxcpath, ">", NULL));
+}
+
+/*
+ * Like apparmor_profile_full() but with slashes replaced by hyphens.
+ * Also used as the profile's file name, see make_apparmor_profile_path().
+ * Callers in this file free() the result.
+ */
+static inline char *apparmor_namespace(const char *ctname, const char *lxcpath)
+{
+       char *full;
+
+       full = apparmor_profile_full(ctname, lxcpath);
+       sanitize_path(full);
+
+       return full;
+}
+
+/*
+ * Run "apparmor_parser --version" and record in aa_supports_unix whether the
+ * reported version is at least 2.10.95.
+ *
+ * Returns true if the parser could be run and a version was parsed, false
+ * otherwise. Version components missing from the output count as 0.
+ *
+ * FIXME: This is currently run only in the context of a constructor (via the
+ * initial lsm_init() called due to its __attribute__((constructor))), so we
+ * do not have ERROR/... macros available, so there are some fprintf(stderr)s
+ * in there.
+ */
+static bool check_apparmor_parser_version(void)
+{
+       struct lxc_popen_FILE *parserpipe;
+       int rc;
+       int major = 0, minor = 0, micro = 0;
+
+       parserpipe = lxc_popen("apparmor_parser --version");
+       if (!parserpipe) {
+               fprintf(stderr, "Failed to run check for apparmor_parser\n");
+               return false;
+       }
+
+       rc = fscanf(parserpipe->f, "AppArmor parser version %d.%d.%d", &major, &minor, &micro);
+       if (rc < 1) {
+               lxc_pclose(parserpipe);
+               /* We stay silent for now as this most likely means the shell
+                * lxc_popen executed failed to find the apparmor_parser binary.
+                * See the FIXME comment above for details.
+                */
+               return false;
+       }
+
+       rc = lxc_pclose(parserpipe);
+       if (rc < 0) {
+               fprintf(stderr, "Error waiting for child process\n");
+               return false;
+       }
+       if (rc != 0) {
+               fprintf(stderr, "'apparmor_parser --version' executed with an error status\n");
+               return false;
+       }
+
+       aa_supports_unix = (major > 2) ||
+                          (major == 2 && minor > 10) ||
+                          (major == 2 && minor == 10 && micro >= 95);
+
+       return true;
+}
+
+/*
+ * Return true only if @path can be opened and its content starts with the
+ * four bytes "yes\n". Any open/read failure yields false.
+ */
+static bool file_is_yes(const char *path)
+{
+       ssize_t rd;
+       int fd;
+       char buf[8]; /* we actually just expect "yes" or "no" */
+
+       fd = open(path, O_RDONLY | O_CLOEXEC);
+       if (fd < 0)
+               return false;
+
+       rd = read(fd, buf, sizeof(buf));
+       close(fd);
+
+       /* rd < 0 (read error) and short reads both fail the length check */
+       return rd >= 4 && strncmp(buf, "yes\n", 4) == 0;
+}
+
+/*
+ * Report whether profile stacking can be used: the kernel's
+ * features/domain/stack file must read "yes" and features/domain/version must
+ * parse as <major>.<minor> with a value of at least 1.2.
+ */
+static bool apparmor_can_stack(void)
+{
+       int major, minor, scanned;
+       FILE *f;
+
+       if (!file_is_yes("/sys/kernel/security/apparmor/features/domain/stack"))
+               return false;
+
+       f = fopen_cloexec("/sys/kernel/security/apparmor/features/domain/version", "r");
+       if (!f)
+               return false;
+
+       scanned = fscanf(f, "%d.%d", &major, &minor);
+       fclose(f);
+       /* major/minor are only read once both were successfully parsed */
+       if (scanned != 2)
+               return false;
+
+       return major > 1 || (major == 1 && minor >= 2);
+}
+
+/*
+ * Grow the buffer *@buf, which currently holds *@bufsz bytes, and append
+ * @size bytes copied from @data, optionally followed by a single '\n'.
+ * *@bufsz is updated to the new number of bytes. No NUL terminator is added
+ * unless it is part of the @size bytes passed in.
+ */
+static void must_append_sized_full(char **buf, size_t *bufsz, const char *data,
+                                  size_t size, bool append_newline)
+{
+       size_t newsize = *bufsz + size;
+
+       if (append_newline)
+               ++newsize;
+
+       *buf = must_realloc(*buf, newsize);
+       memcpy(*buf + *bufsz, data, size);
+
+       if (append_newline)
+               (*buf)[newsize - 1] = '\n';
+
+       *bufsz = newsize;
+}
+
+/*
+ * Append @size bytes of @data to the buffer described by @buf/@bufsz without
+ * adding a trailing newline; see must_append_sized_full().
+ */
+static void must_append_sized(char **buf, size_t *bufsz, const char *data, size_t size)
+{
+       /* plain call: a void function must not 'return' an expression */
+       must_append_sized_full(buf, bufsz, data, size, false);
+}
+
+/* A container counts as privileged here when its id_map list is empty. */
+static bool is_privileged(struct lxc_conf *conf)
+{
+       return lxc_list_empty(&conf->id_map);
+}
+
+/*
+ * Assemble the text of the generated profile for this container.
+ *
+ * The result starts with the tunables include and the "profile ... flags=..."
+ * header, followed by AA_PROFILE_BASE and the optional parts selected by the
+ * detected features (aa_supports_unix, /proc/self/ns/cgroup, stacking) and by
+ * the configuration (nesting, unprivileged, conf->lsm_aa_raw lines).
+ *
+ * Returns a heap-allocated, NUL-terminated string; load_apparmor_profile()
+ * frees it.
+ */
+static char *get_apparmor_profile_content(struct lxc_conf *conf, const char *lxcpath)
+{
+       char *profile, *profile_name_full;
+       size_t size;
+       struct lxc_list *it;
+
+       profile_name_full = apparmor_profile_full(conf->name, lxcpath);
+
+       profile = must_concat(
+"#include <tunables/global>\n"
+"profile \"", profile_name_full, "\" flags=(attach_disconnected,mediate_deleted) {\n",
+                             NULL);
+       /* from here on 'size' tracks the bytes in use, without a terminator */
+       size = strlen(profile);
+
+       must_append_sized(&profile, &size, AA_PROFILE_BASE,
+                         sizeof(AA_PROFILE_BASE) - 1);
+
+       if (aa_supports_unix)
+               must_append_sized(&profile, &size, AA_PROFILE_UNIX_SOCKETS,
+                                 sizeof(AA_PROFILE_UNIX_SOCKETS) - 1);
+
+       if (file_exists("/proc/self/ns/cgroup"))
+               must_append_sized(&profile, &size, AA_PROFILE_CGROUP_NAMESPACES,
+                                 sizeof(AA_PROFILE_CGROUP_NAMESPACES) - 1);
+
+       if (aa_can_stack && !aa_is_stacked) {
+               char *namespace, *temp;
+
+               must_append_sized(&profile, &size, AA_PROFILE_STACKING_BASE,
+                                 sizeof(AA_PROFILE_STACKING_BASE) - 1);
+
+               /* change_profile rules naming this container's policy namespace */
+               namespace = apparmor_namespace(conf->name, lxcpath);
+               temp = must_concat("  change_profile -> \":", namespace, ":*\",\n"
+                                  "  change_profile -> \":", namespace, "://*\",\n",
+                                  NULL);
+               free(namespace);
+
+               must_append_sized(&profile, &size, temp, strlen(temp));
+               free(temp);
+       } else {
+               must_append_sized(&profile, &size, AA_PROFILE_NO_STACKING,
+                                 sizeof(AA_PROFILE_NO_STACKING) - 1);
+       }
+
+       if (conf->lsm_aa_allow_nesting) {
+               must_append_sized(&profile, &size, AA_PROFILE_NESTING_BASE,
+                                 sizeof(AA_PROFILE_NESTING_BASE) - 1);
+
+               /* without a namespace, add a change_profile rule for this very profile */
+               if (!aa_can_stack || aa_is_stacked) {
+                       char *temp;
+
+                       temp = must_concat("  change_profile -> \"",
+                                          profile_name_full, "\",\n", NULL);
+                       must_append_sized(&profile, &size, temp, strlen(temp));
+                       free(temp);
+               }
+       }
+
+       if (!is_privileged(conf) || am_host_unpriv())
+               must_append_sized(&profile, &size, AA_PROFILE_UNPRIVILEGED,
+                                 sizeof(AA_PROFILE_UNPRIVILEGED) - 1);
+
+       /* raw lines from the configuration, each followed by a newline */
+       lxc_list_for_each(it, &conf->lsm_aa_raw) {
+               const char *line = it->elem;
+
+               must_append_sized_full(&profile, &size, line, strlen(line), true);
+       }
+
+       /* include terminating \0 byte */
+       must_append_sized(&profile, &size, "}\n", 3);
+
+       free(profile_name_full);
+
+       return profile;
+}
+
 /*
- * apparmor_process_label_set: Set AppArmor process profile
- *
- * @label   : the profile to set
- * @conf    : the container configuration to use if @label is NULL
- * @default : use the default profile if @label is NULL
- * @on_exec : this is ignored.  Apparmor profile will be changed immediately
- *
- * Returns 0 on success, < 0 on failure
- *
- * Notes: This relies on /proc being available.
+ * apparmor_parser creates a cache file using the parsed file's name as a name.
+ * This means there may be multiple containers with the same name but different
+ * lxcpaths. Therefore we need a sanitized version of the complete profile name
+ * as profile file-name.
+ * We already get this exactly from apparmor_namespace().
+ *
+ * The path is built by must_make_path() from lxcpath, ctname, "apparmor" and
+ * that name; the callers in this file free() the result.
  */
-static int apparmor_process_label_set(const char *inlabel, struct lxc_conf *conf,
-                                     bool use_default, bool on_exec)
+static char *make_apparmor_profile_path(const char *ctname, const char *lxcpath)
 {
-       int label_fd, ret;
-       pid_t tid;
-       const char *label = inlabel ? inlabel : conf->lsm_aa_profile;
-       char *curlabel;
+       char *ret, *filename;
 
-       if (!aa_enabled)
-               return 0;
+       filename = apparmor_namespace(ctname, lxcpath);
+       ret = must_make_path(lxcpath, ctname, "apparmor", filename, NULL);
+       free(filename);
+
+       return ret;
+}
+
+/*
+ * Path of this container's policy namespace directory below
+ * /sys/kernel/security/apparmor/policy/namespaces. The callers in this file
+ * free() the result.
+ */
+static char *make_apparmor_namespace_path(const char *ctname, const char *lxcpath)
+{
+       char *ret, *namespace;
+
+       namespace = apparmor_namespace(ctname, lxcpath);
+       ret = must_make_path("/sys/kernel/security/apparmor/policy/namespaces", namespace, NULL);
+       free(namespace);
+
+       return ret;
+}
+
+/*
+ * Create the container's policy namespace directory if a namespace is going
+ * to be used, i.e. stacking is available and we are not already stacked.
+ * Otherwise nothing is created and true is returned. An already existing
+ * directory is not an error.
+ */
+static bool make_apparmor_namespace(struct lxc_conf *conf, const char *lxcpath)
+{
+       char *path;
+
+       if (!aa_can_stack || aa_is_stacked)
+               return true;
+
+       path = make_apparmor_namespace_path(conf->name, lxcpath);
+       errno = 0;
+       if (mkdir(path, 0755) < 0 && errno != EEXIST) {
+               SYSERROR("Error creating AppArmor namespace: %s", path);
+               free(path);
+               return false;
+       }
+       free(path);
+
+       return true;
+}
+
+/*
+ * Remove the policy namespace directory created by make_apparmor_namespace().
+ * Nothing is done when no namespace is in use (stacking unavailable or this
+ * process already stacked); that is the same condition under which
+ * make_apparmor_namespace() skips creating one, so we do not rmdir() and log
+ * an error for a directory that was never made. Failures are only logged.
+ */
+static void remove_apparmor_namespace(struct lxc_conf *conf, const char *lxcpath)
+{
+       char *path;
+
+       if (!aa_can_stack || aa_is_stacked)
+               return;
+
+       path = make_apparmor_namespace_path(conf->name, lxcpath);
+       if (rmdir(path) != 0)
+               SYSERROR("Error removing AppArmor namespace: %s", path);
+       free(path);
+}
+
+/* Arguments handed to apparmor_parser_exec() through run_command(). */
+struct apparmor_parser_args {
+       char cmd;   /* single option letter placed right after the '-' */
+       char *file; /* profile file passed as the last argument */
+};
+
+/*
+ * Callback for run_command(): exec apparmor_parser with the option string
+ * "-<cmd>WL", followed by APPARMOR_CACHE_DIR and the profile file.
+ * NOTE(review): 'W' and 'L' are presumably the parser's write-cache and
+ * cache-location switches - confirm against apparmor_parser(8).
+ * Only returns (with -1) if execlp() failed.
+ */
+static int apparmor_parser_exec(void *data)
+{
+       struct apparmor_parser_args *args = data;
+       char cmdbuf[] = { '-', args->cmd, 'W', 'L', 0 };
+
+       execlp("apparmor_parser", "apparmor_parser", cmdbuf, APPARMOR_CACHE_DIR, args->file, NULL);
+
+       return -1;
+}
+
+/*
+ * Run apparmor_parser with option letter @command on this container's
+ * generated profile file. Returns the result of run_command(), with any
+ * negative value normalized to -1 after logging the captured output.
+ */
+static int run_apparmor_parser(char command,
+                               struct lxc_conf *conf,
+                               const char *lxcpath)
+{
+       char output[MAXPATHLEN];
+       int ret;
+       struct apparmor_parser_args args = {
+               .cmd = command,
+               .file = make_apparmor_profile_path(conf->name, lxcpath),
+       };
+
+       ret = run_command(output, sizeof(output), apparmor_parser_exec, (void*)&args);
+       if (ret < 0) {
+               ERROR("Failed to run apparmor_parser on \"%s\": %s", args.file, output);
+               ret = -1;
+       }
+
+
+       free(args.file);
+       return ret;
+}
+
+/*
+ * Best-effort removal of the generated profile file and of the container's
+ * apparmor/ directory; errors from unlink()/rmdir() are deliberately ignored.
+ */
+static void remove_apparmor_profile(struct lxc_conf *conf, const char *lxcpath)
+{
+       char *path;
+
+       /* It's ok if these deletes fail: if the container was never started,
+        * we'll have never written a profile or cached it.
+        */
+
+       path = make_apparmor_profile_path(conf->name, lxcpath);
+       (void)unlink(path);
+       free(path);
+
+       /* Also remove the apparmor/ subdirectory */
+       path = apparmor_dir(conf->name, lxcpath);
+       (void)rmdir(path);
+       free(path);
+}
+
+/*
+ * Write the generated profile for this container to disk if it differs from
+ * what is already there, then load it through apparmor_parser.
+ *
+ * On success conf->lsm_aa_profile_created is set and 0 is returned. On failure
+ * a non-zero value is returned; failures after the namespace step also remove
+ * the namespace again and, if the parser run failed, the profile file too.
+ */
+static int load_apparmor_profile(struct lxc_conf *conf, const char *lxcpath)
+{
+       struct stat profile_sb;
+       size_t content_len;
+       int ret = -1;
+       size_t old_len = 0;
+       char *profile_path = NULL, *old_content = NULL, *new_content = NULL;
+       int profile_fd = -1;
+
+       if (!make_apparmor_namespace(conf, lxcpath))
+               return -1;
+
+       /* In order to avoid forcing a profile parse (potentially slow) on
+        * every container start, let's use apparmor's binary policy cache,
+        * which checks mtime of the files to figure out if the policy needs to
+        * be regenerated.
+        *
+        * Since it uses mtimes, we shouldn't just always write out our local
+        * apparmor template; instead we should check to see whether the
+        * template is the same as ours. If it isn't we should write our
+        * version out so that the new changes are reflected and we definitely
+        * force a recompile.
+        */
+
+       profile_path = make_apparmor_profile_path(conf->name, lxcpath);
+       profile_fd = open(profile_path, O_RDONLY | O_CLOEXEC);
+       if (profile_fd >= 0) {
+               if (fstat(profile_fd, &profile_sb) < 0) {
+                       SYSERROR("Error accessing old profile from %s",
+                                profile_path);
+                       goto out;
+               }
+               old_len = profile_sb.st_size;
+               old_content = lxc_strmmap(NULL, old_len, PROT_READ,
+                                         MAP_PRIVATE, profile_fd, 0);
+               if (!old_content) {
+                       SYSERROR("Failed to mmap old profile from %s",
+                                profile_path);
+                       goto out;
+               }
+       } else if (errno != ENOENT) {
+               /* a missing file just means there is nothing to compare with */
+               SYSERROR("Error reading old profile from %s", profile_path);
+               goto out;
+       }
+
+       new_content = get_apparmor_profile_content(conf, lxcpath);
+       if (!new_content)
+               goto out;
+
+       content_len = strlen(new_content);
+
+       /* only (re)write the file when there is no old one or it differs */
+       if (!old_content || old_len != content_len || memcmp(old_content, new_content, content_len) != 0) {
+               char *path;
+
+               ret = mkdir_p(APPARMOR_CACHE_DIR, 0755);
+               if (ret < 0) {
+                       SYSERROR("Error creating AppArmor profile cache directory " APPARMOR_CACHE_DIR);
+                       goto out;
+               }
+
+               path = apparmor_dir(conf->name, lxcpath);
+               ret = mkdir_p(path, 0755);
+               if (ret < 0) {
+                       SYSERROR("Error creating AppArmor profile directory: %s", path);
+                       free(path);
+                       goto out;
+               }
+               free(path);
+
+               ret = lxc_write_to_file(profile_path, new_content, content_len, false, 0600);
+               if (ret < 0) {
+                       SYSERROR("Error writing profile to %s", profile_path);
+                       goto out;
+               }
+       }
+
+       ret = run_apparmor_parser(AA_CMD_LOAD, conf, lxcpath);
+       if (ret != 0)
+               goto out_remove_profile;
+
+       conf->lsm_aa_profile_created = true;
+
+       goto out_ok;
+
+       /* the labels below fall through: out_remove_profile -> out -> out_ok */
+out_remove_profile:
+       remove_apparmor_profile(conf, lxcpath);
+out:
+       remove_apparmor_namespace(conf, lxcpath);
+out_ok:
+       if (profile_fd >= 0) {
+               if (old_content)
+                       lxc_strmunmap(old_content, old_len);
+               close(profile_fd);
+       }
+       free(profile_path);
+       free(new_content);
+       return ret;
+}
+
+/*
+ * Undo what load_apparmor_profile() set up for a generated profile: remove
+ * the container's policy namespace, unload the profile through
+ * apparmor_parser and remove the generated profile file together with the
+ * container's apparmor/ directory. Nothing in here touches
+ * APPARMOR_CACHE_DIR directly.
+ *
+ * Does nothing unless aa_admin is set and this conf had a profile created.
+ */
+static void apparmor_cleanup(struct lxc_conf *conf, const char *lxcpath)
+{
+       if (!aa_admin)
+               return;
+
+       if (!conf->lsm_aa_profile_created)
+               return;
+
+       remove_apparmor_namespace(conf, lxcpath);
+       (void)run_apparmor_parser(AA_CMD_UNLOAD, conf, lxcpath);
+
+       remove_apparmor_profile(conf, lxcpath);
+}
+
+static int apparmor_prepare(struct lxc_conf *conf, const char *lxcpath)
+{
+       int ret = -1;
+       const char *label;
+       char *curlabel = NULL, *genlabel = NULL;
+
+       if (!aa_enabled) {
+               ERROR("AppArmor not enabled");
+               return -1;
+       }
+
+       label = conf->lsm_aa_profile;
 
        /* user may request that we just ignore apparmor */
        if (label && strcmp(label, AA_UNCHANGED) == 0) {
-               INFO("apparmor profile unchanged per user request");
+               INFO("AppArmor profile unchanged per user request");
+               conf->lsm_aa_profile_computed = must_copy_string(label);
                return 0;
        }
 
+       if (label && strcmp(label, AA_GENERATED) == 0) {
+               if (!aa_parser_available) {
+                       ERROR("Cannot use generated profile: apparmor_parser not available");
+                       goto out;
+               }
+
+               /* auto-generate profile based on available/requested security features */
+               if (load_apparmor_profile(conf, lxcpath) != 0) {
+                       ERROR("Failed to load generated AppArmor profile");
+                       goto out;
+               }
+
+               genlabel = apparmor_profile_full(conf->name, lxcpath);
+               if (!genlabel) {
+                       ERROR("Failed to build AppArmor profile name");
+                       goto out;
+               }
+
+               if (aa_can_stack && !aa_is_stacked) {
+                       char *namespace = apparmor_namespace(conf->name, lxcpath);
+                       size_t llen = strlen(genlabel);
+                       must_append_sized(&genlabel, &llen, "//&:", sizeof("//&:") - 1);
+                       must_append_sized(&genlabel, &llen, namespace, strlen(namespace));
+                       must_append_sized(&genlabel, &llen, ":", sizeof(":")); /* with the nul byte */
+                       free(namespace);
+               }
+
+               label = genlabel;
+       }
+
        curlabel = apparmor_process_label_get(lxc_raw_getpid());
 
-       if (!aa_stacking_supported() && aa_needs_transition(curlabel)) {
+       if (!aa_can_stack && aa_needs_transition(curlabel)) {
                /* we're already confined, and stacking isn't supported */
 
                if (!label || strcmp(curlabel, label) == 0) {
                        /* no change requested */
-                       free(curlabel);
-                       return 0;
+                       ret = 0;
+                       goto out;
                }
 
-               ERROR("already apparmor confined, but new label requested.");
-               free(curlabel);
-               return -1;
+               ERROR("Already AppArmor confined, but new label requested.");
+               goto out;
        }
-       free(curlabel);
 
        if (!label) {
-               if (use_default) {
-                       if (cgns_supported())
-                               label = AA_DEF_PROFILE_CGNS;
-                       else
-                               label = AA_DEF_PROFILE;
-               }
+               if (cgns_supported())
+                       label = AA_DEF_PROFILE_CGNS;
                else
-                       label = "unconfined";
+                       label = AA_DEF_PROFILE;
        }
 
        if (!check_mount_feature_enabled() && strcmp(label, "unconfined") != 0) {
@@ -223,30 +1036,78 @@ static int apparmor_process_label_set(const char *inlabel, struct lxc_conf *conf
                        ERROR("If you really want to start this container, set");
                        ERROR("lxc.apparmor.allow_incomplete = 1");
                        ERROR("in your container configuration file");
-                       return -1;
+                       goto out;
                }
        }
 
+       conf->lsm_aa_profile_computed = must_copy_string(label);
+       ret = 0;
+
+out:
+       if (genlabel) {
+               free(genlabel);
+               if (ret != 0)
+                       apparmor_cleanup(conf, lxcpath);
+       }
+       free(curlabel);
+       return ret;
+}
+
+/*
+ * apparmor_process_label_set: Set AppArmor process profile
+ *
+ * @inlabel : the profile to set
+ * @conf    : the container configuration; its lsm_aa_profile_computed (stored
+ *            by apparmor_prepare()) is used if @inlabel is NULL
+ * @on_exec : forwarded to lsm_process_label_fd_get() and
+ *            lsm_process_label_set_at()
+ *
+ * Returns 0 on success, < 0 on failure
+ *
+ * Notes: This relies on /proc being available.
+ */
+static int apparmor_process_label_set(const char *inlabel, struct lxc_conf *conf,
+                                     bool on_exec)
+{
+       int label_fd, ret;
+       pid_t tid;
+       const char *label;
+
+       if (!aa_enabled) {
+               ERROR("AppArmor not enabled");
+               return -1;
+       }
+
+       label = inlabel ? inlabel : conf->lsm_aa_profile_computed;
+       if (!label) {
+               ERROR("LSM wasn't prepared");
+               return -1;
+       }
+
+       /* user may request that we just ignore apparmor */
+       if (strcmp(label, AA_UNCHANGED) == 0) {
+               INFO("AppArmor profile unchanged per user request");
+               return 0;
+       }
 
        if (strcmp(label, "unconfined") == 0 && apparmor_am_unconfined()) {
-               INFO("apparmor profile unchanged");
+               INFO("AppArmor profile unchanged");
                return 0;
        }
        tid = lxc_raw_gettid();
        label_fd = lsm_process_label_fd_get(tid, on_exec);
        if (label_fd < 0) {
-               SYSERROR("Failed to change apparmor profile to %s", label);
+               SYSERROR("Failed to change AppArmor profile to %s", label);
                return -1;
        }
 
        ret = lsm_process_label_set_at(label_fd, label, on_exec);
        close(label_fd);
        if (ret < 0) {
-               ERROR("Failed to change apparmor profile to %s", label);
+               ERROR("Failed to change AppArmor profile to %s", label);
                return -1;
        }
 
-       INFO("Changed apparmor profile to %s", label);
+       INFO("Changed AppArmor profile to %s", label);
        return 0;
 }
 
@@ -255,12 +1116,39 @@ static struct lsm_drv apparmor_drv = {
        .enabled           = apparmor_enabled,
        .process_label_get = apparmor_process_label_get,
        .process_label_set = apparmor_process_label_set,
+       .prepare           = apparmor_prepare,
+       .cleanup           = apparmor_cleanup,
 };
 
 struct lsm_drv *lsm_apparmor_drv_init(void)
 {
+       bool have_mac_admin = false;
+
        if (!apparmor_enabled())
                return NULL;
+
+       /* We only support generated profiles when apparmor_parser is usable */
+       if (!check_apparmor_parser_version())
+               goto out;
+
+       aa_parser_available = true;
+
+       aa_can_stack = apparmor_can_stack();
+       if (aa_can_stack)
+               aa_is_stacked = file_is_yes("/sys/kernel/security/apparmor/.ns_stacked");
+
+       #if HAVE_LIBCAP
+       have_mac_admin = lxc_proc_cap_is_set(CAP_SETGID, CAP_EFFECTIVE);
+       #endif
+
+       if (!have_mac_admin)
+               WARN("Per-container AppArmor profiles are disabled because the mac_admin capability is missing");
+       else if (am_host_unpriv() && !aa_is_stacked)
+               WARN("Per-container AppArmor profiles are disabled because LXC is running in an unprivileged container without stacking");
+       else
+               aa_admin = true;
+
+out:
        aa_enabled = 1;
        return &apparmor_drv;
 }
index 8d7de2dbef36f829a16fe7c650d5b3c93622d47d..46e2120690ee8993b1ce7765add53bdfcc61f68c 100644 (file)
@@ -177,11 +177,37 @@ on_error:
 }
 
 int lsm_process_label_set(const char *label, struct lxc_conf *conf,
-                         bool use_default, bool on_exec)
+                         bool on_exec)
 {
+       /* Forward to the active driver; fails with -1 when there is none. */
        if (!drv) {
                ERROR("LSM driver not inited");
                return -1;
        }
-       return drv->process_label_set(label, conf, use_default, on_exec);
+       return drv->process_label_set(label, conf, on_exec);
+}
+
+/*
+ * Call the active driver's optional prepare hook and return its result.
+ * Returns 0 when the driver has no such hook. A missing driver is logged as
+ * an error but also yields 0, unlike lsm_process_label_set() which returns -1
+ * in that situation.
+ */
+int lsm_process_prepare(struct lxc_conf *conf, const char *lxcpath)
+{
+       if (!drv) {
+               ERROR("LSM driver not inited");
+               return 0;
+       }
+
+       if (!drv->prepare)
+               return 0;
+
+       return drv->prepare(conf, lxcpath);
+}
+
+/*
+ * Call the active driver's optional cleanup hook. Does nothing when the
+ * driver has no such hook; a missing driver is only logged.
+ */
+void lsm_process_cleanup(struct lxc_conf *conf, const char *lxcpath)
+{
+       if (!drv) {
+               ERROR("LSM driver not inited");
+               return;
+       }
+
+       if (!drv->cleanup)
+               return;
+
+       drv->cleanup(conf, lxcpath);
 }
index cafb2ac7cde0140345e170b4608e60fdf9464b06..52e656d6fb7929ad346029e4b85ca5740a2f1fd2 100644 (file)
@@ -38,17 +38,21 @@ struct lsm_drv {
        int (*enabled)(void);
        char *(*process_label_get)(pid_t pid);
        int (*process_label_set)(const char *label, struct lxc_conf *conf,
-                                bool use_default, bool on_exec);
+                                bool on_exec);
+       int (*prepare)(struct lxc_conf *conf, const char *lxcpath);
+       void (*cleanup)(struct lxc_conf *conf, const char *lxcpath);
 };
 
 extern void lsm_init(void);
 extern int lsm_enabled(void);
 extern const char *lsm_name(void);
 extern char *lsm_process_label_get(pid_t pid);
+extern int lsm_process_prepare(struct lxc_conf *conf, const char *lxcpath);
 extern int lsm_process_label_set(const char *label, struct lxc_conf *conf,
-                                bool use_default, bool on_exec);
+                                bool on_exec);
 extern int lsm_process_label_fd_get(pid_t pid, bool on_exec);
 extern int lsm_process_label_set_at(int label_fd, const char *label,
                                    bool on_exec);
+extern void lsm_process_cleanup(struct lxc_conf *conf, const char *lxcpath);
 
 #endif /* __LXC_LSM_H */
index 7bb8121b8ee6da6af720b652165c14285954943c..9397f2bfb5d7a34f57d4a5bf880876f797f0f0c9 100644 (file)
@@ -30,7 +30,7 @@ static char *nop_process_label_get(pid_t pid)
 }
 
 static int nop_process_label_set(const char *label, struct lxc_conf *conf,
-                                bool use_default, bool on_exec)
+                                bool on_exec)
 {
+       /* nop driver: all arguments are ignored and success is reported */
        return 0;
 }
index c88c18e3d7771a7258ae838c65de286c0d5abc73..9f7b7bc31334b7a9b636776e89ab4711eea87c11 100644 (file)
@@ -75,15 +75,13 @@ static char *selinux_process_label_get(pid_t pid)
  * Notes: This relies on /proc being available.
  */
 static int selinux_process_label_set(const char *inlabel, struct lxc_conf *conf,
-                                    bool use_default, bool on_exec)
+                                    bool on_exec)
 {
        int ret;
        const char *label;
 
        label = inlabel ? inlabel : conf->lsm_se_context;
        if (!label) {
-               if (!use_default)
-                       return -EINVAL;
 
                label = DEFAULT_LABEL;
        }
index ec372b4a2bc35f2ccba242a55ebaa60504b0d1fb..061e479175bc5c2e2046dcb15da46689f435e8f7 100644 (file)
@@ -863,9 +863,19 @@ int lxc_init(const char *name, struct lxc_handler *handler)
        }
        TRACE("Initialized cgroup driver");
 
+       ret = lsm_process_prepare(conf, handler->lxcpath);
+       if (ret < 0) {
+               ERROR("Failed to initialize LSM");
+               goto out_destroy_cgroups;
+       }
+       TRACE("Initialized LSM");
+
        INFO("Container \"%s\" is initialized", name);
        return 0;
 
+out_destroy_cgroups:
+       handler->cgroup_ops->destroy(handler->cgroup_ops, handler);
+
 out_delete_terminal:
        lxc_terminal_delete(&handler->conf->console);
 
@@ -956,6 +966,8 @@ void lxc_fini(const char *name, struct lxc_handler *handler)
        while (namespace_count--)
                free(namespaces[namespace_count]);
 
+       lsm_process_cleanup(handler->conf, handler->lxcpath);
+
        cgroup_ops->destroy(cgroup_ops, handler);
        cgroup_exit(cgroup_ops);
 
@@ -1235,7 +1247,7 @@ static int do_start(void *data)
        }
 
        /* Set the label to change to when we exec(2) the container's init. */
-       ret = lsm_process_label_set(NULL, handler->conf, 1, 1);
+       ret = lsm_process_label_set(NULL, handler->conf, true);
        if (ret < 0)
                goto out_warn_father;