mainloop: add io_uring support

[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c

index 711a9d7f5493d8d7883ccfc47e0c7ed6e15785f7..46754217ccfaa583fe557fac7f6f86a73045d85a 100644 (file)
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -40,6 +40,7 @@
  #include "commands_utils.h"
  #include "conf.h"
  #include "config.h"
+#include "error_utils.h"
  #include "log.h"
  #include "macro.h"
  #include "mainloop.h"
@@ -60,22 +61,29 @@
  
  lxc_log_define(cgfsng, cgroup);
  
-/* Given a pointer to a null-terminated array of pointers, realloc to add one
+/*
+ * Given a pointer to a null-terminated array of pointers, realloc to add one
   * entry, and point the new entry to NULL. Do not fail. Return the index to the
   * second-to-last entry - that is, the one which is now available for use
   * (keeping the list null-terminated).
   */
-static int append_null_to_list(void ***list)
+static int list_add(void ***list)
  {
-       int newentry = 0;
+       int idx = 0;
+       void **p;
  
         if (*list)
-               for (; (*list)[newentry]; newentry++)
+               for (; (*list)[idx]; idx++)
                         ;
  
-       *list = must_realloc(*list, (newentry + 2) * sizeof(void **));
-       (*list)[newentry + 1] = NULL;
-       return newentry;
+       p = realloc(*list, (idx + 2) * sizeof(void **));
+       if (!p)
+               return ret_errno(ENOMEM);
+
+       p[idx + 1] = NULL;
+       *list = p;
+
+       return idx;
  }
  
  /* Given a null-terminated array of strings, check whether @entry is one of the
@@ -93,63 +101,10 @@ static bool string_in_list(char **list, const char *entry)
         return false;
  }
  
-/* Return a copy of @entry prepending "name=", i.e.  turn "systemd" into
- * "name=systemd". Do not fail.
- */
-static char *cg_legacy_must_prefix_named(char *entry)
-{
-       size_t len;
-       char *prefixed;
-
-       len = strlen(entry);
-       prefixed = must_realloc(NULL, len + 6);
-
-       memcpy(prefixed, "name=", STRLITERALLEN("name="));
-       memcpy(prefixed + STRLITERALLEN("name="), entry, len);
-       prefixed[len + 5] = '\0';
-
-       return prefixed;
-}
-
-/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
- * we are called.
- *
- * We also handle named subsystems here. Any controller which is not a kernel
- * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
- * we refuse to use because we're not sure which we have here.
- * (TODO: We could work around this in some cases by just remounting to be
- * unambiguous, or by comparing mountpoint contents with current cgroup.)
- *
- * The last entry will always be NULL.
- */
-static void must_append_controller(char **klist, char **nlist, char ***clist,
-                                  char *entry)
-{
-       int newentry;
-       char *copy;
-
-       if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
-               ERROR("Refusing to use ambiguous controller \"%s\"", entry);
-               ERROR("It is both a named and kernel subsystem");
-               return;
-       }
-
-       newentry = append_null_to_list((void ***)clist);
-
-       if (strnequal(entry, "name=", 5))
-               copy = must_copy_string(entry);
-       else if (string_in_list(klist, entry))
-               copy = must_copy_string(entry);
-       else
-               copy = cg_legacy_must_prefix_named(entry);
-
-       (*clist)[newentry] = copy;
-}
-
  /* Given a handler's cgroup data, return the struct hierarchy for the controller
   * @c, or NULL if there is none.
   */
-static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
+static struct hierarchy *get_hierarchy(const struct cgroup_ops *ops, const char *controller)
  {
         if (!ops->hierarchies)
                 return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");
@@ -169,12 +124,12 @@ static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *contr
                  */
                 if (pure_unified_layout(ops)) {
                         if (strequal(controller, "devices")) {
-                               if (ops->unified->bpf_device_controller)
+                               if (device_utility_controller(ops->unified))
                                         return ops->unified;
  
                                 break;
                         } else if (strequal(controller, "freezer")) {
-                               if (ops->unified->freezer_controller)
+                               if (freezer_utility_controller(ops->unified))
                                         return ops->unified;
  
                                 break;
@@ -193,6 +148,38 @@ static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *contr
         return ret_set_errno(NULL, ENOENT);
  }
  
+/*
+ * Fill @fd with the cgroup fd of the hierarchy that has fd->controller: the
+ * limiting cgroup (dfd_lim) if @limit is set, else the container's (dfd_con).
+ * Returns 0 on success, else ret_errno() of ENOENT (no such hierarchy),
+ * EINVAL (requested cgroup version mismatch) or EBADF (that fd is negative).
+ */
+int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool limit)
+{
+       int dfd;
+       const struct hierarchy *h;
+
+       h = get_hierarchy(ops, fd->controller);
+       if (!h)
+               return ret_errno(ENOENT);
+
+       /* The client may require the controller to be a specific cgroup version. */
+       if (fd->type != 0 && fd->type != h->fs_type)
+               return ret_errno(EINVAL);
+
+       dfd = limit ? h->dfd_lim : h->dfd_con;
+       if (dfd < 0)
+               return ret_errno(EBADF);
+
+       fd->layout = ops->cgroup_layout;
+       fd->type = h->fs_type;
+       if (fd->type == UNIFIED_HIERARCHY)
+               fd->utilities = h->utilities;
+       fd->fd = dfd;
+
+       return 0;
+}
+
  /* Taken over modified from the kernel sources. */
  #define NBITS 32 /* bits in uint32_t */
  #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
@@ -315,44 +302,13 @@ static ssize_t get_max_cpus(char *cpulist)
  
  static inline bool is_unified_hierarchy(const struct hierarchy *h)
  {
-       return h->version == CGROUP2_SUPER_MAGIC;
-}
-
-/* Given two null-terminated lists of strings, return true if any string is in
- * both.
- */
-static bool controller_lists_intersect(char **l1, char **l2)
-{
-       if (!l1 || !l2)
-               return false;
-
-       for (int i = 0; l1[i]; i++)
-               if (string_in_list(l2, l1[i]))
-                       return true;
-
-       return false;
-}
-
-/* For a null-terminated list of controllers @clist, return true if any of those
- * controllers is already listed the null-terminated list of hierarchies @hlist.
- * Realistically, if one is present, all must be present.
- */
-static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
-{
-       if (!hlist)
-               return false;
-
-       for (int i = 0; hlist[i]; i++)
-               if (controller_lists_intersect(hlist[i]->controllers, clist))
-                       return true;
-
-       return false;
+       return h->fs_type == UNIFIED_HIERARCHY;
  }
  
  /* Return true if the controller @entry is found in the null-terminated list of
   * hierarchies @hlist.
   */
-static bool controller_found(struct hierarchy **hlist, char *entry)
+static bool controller_available(struct hierarchy **hlist, char *entry)
  {
         if (!hlist)
                 return false;
@@ -364,10 +320,7 @@ static bool controller_found(struct hierarchy **hlist, char *entry)
         return false;
  }
  
-/* Return true if all of the controllers which we require have been found.  The
- * required list is  freezer and anything in lxc.cgroup.use.
- */
-static bool all_controllers_found(struct cgroup_ops *ops)
+static bool controllers_available(struct cgroup_ops *ops)
  {
         struct hierarchy **hlist;
  
@@ -376,104 +329,73 @@ static bool all_controllers_found(struct cgroup_ops *ops)
  
         hlist = ops->hierarchies;
         for (char **cur = ops->cgroup_use; cur && *cur; cur++)
-               if (!controller_found(hlist, *cur))
-                       return log_error(false, "No %s controller mountpoint found", *cur);
+               if (!controller_available(hlist, *cur))
+                       return log_error(false, "The %s controller was not found", *cur);
  
         return true;
  }
  
-/* Get the controllers from a mountinfo line There are other ways we could get
- * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
- * could parse the mount options. But we simply assume that the mountpoint must
- * be /sys/fs/cgroup/controller-list
- */
-static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
-                                       int type)
+static char **list_new(void)
  {
-       /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
-        * for legacy hierarchies.
-        */
-       __do_free_string_list char **aret = NULL;
-       int i;
-       char *p2, *tok;
-       char *p = line, *sep = ",";
-
-       for (i = 0; i < 4; i++) {
-               p = strchr(p, ' ');
-               if (!p)
-                       return NULL;
-               p++;
-       }
+       __do_free_string_list char **list = NULL;
+       int idx;
  
-       /* Note, if we change how mountinfo works, then our caller will need to
-        * verify /sys/fs/cgroup/ in this field.
-        */
-       if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
-               return log_warn(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);
+       idx = list_add((void ***)&list);
+       if (idx < 0)
+               return NULL;
  
-       p += 15;
-       p2 = strchr(p, ' ');
-       if (!p2)
-               return log_error(NULL, "Corrupt mountinfo");
-       *p2 = '\0';
+       list[idx] = NULL;
+       return move_ptr(list);
+}
  
-       if (type == CGROUP_SUPER_MAGIC) {
-               __do_free char *dup = NULL;
+static int list_add_string(char ***list, char *entry)
+{
+       __do_free char *dup = NULL;
+       int idx;
  
-               /* strdup() here for v1 hierarchies. Otherwise
-                * lxc_iterate_parts() will destroy mountpoints such as
-                * "/sys/fs/cgroup/cpu,cpuacct".
-                */
-               dup = must_copy_string(p);
-               if (!dup)
-                       return NULL;
+       dup = strdup(entry);
+       if (!dup)
+               return ret_errno(ENOMEM);
  
-               lxc_iterate_parts(tok, dup, sep)
-                       must_append_controller(klist, nlist, &aret, tok);
-       }
-       *p2 = ' ';
+       idx = list_add((void ***)list);
+       if (idx < 0)
+               return idx;
  
-       return move_ptr(aret);
+       (*list)[idx] = move_ptr(dup);
+       return 0;
  }
  
-static char **cg_unified_make_empty_controller(void)
+static char **list_add_controllers(char *controllers)
  {
-       __do_free_string_list char **aret = NULL;
-       int newentry;
+       __do_free_string_list char **list = NULL;
+       char *it;
+
+       lxc_iterate_parts(it, controllers, ", \t\n") {
+               int ret;
  
-       newentry = append_null_to_list((void ***)&aret);
-       aret[newentry] = NULL;
-       return move_ptr(aret);
+               ret = list_add_string(&list, it);
+               if (ret < 0)
+                       return NULL;
+       }
+
+       return move_ptr(list);
  }
  
-static char **cg_unified_get_controllers(int dfd, const char *file)
+static char **unified_controllers(int dfd, const char *file)
  {
         __do_free char *buf = NULL;
-       __do_free_string_list char **aret = NULL;
-       char *sep = " \t\n";
-       char *tok;
  
         buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
         if (!buf)
                 return NULL;
  
-       lxc_iterate_parts(tok, buf, sep) {
-               int newentry;
-               char *copy;
-
-               newentry = append_null_to_list((void ***)&aret);
-               copy = must_copy_string(tok);
-               aret[newentry] = copy;
-       }
-
-       return move_ptr(aret);
+       return list_add_controllers(buf);
  }
  
-static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
-                                      char **controllers)
+static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
  {
         if (!ops->cgroup_use)
-               return true;
+               return false;
  
         for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
                 bool found = false;
@@ -489,299 +411,54 @@ static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
                 if (found)
                         continue;
  
-               return false;
+               return true;
         }
  
-       return true;
+       return false;
  }
  
-static int add_hierarchy(struct cgroup_ops *ops, char **clist, char *mountpoint,
-                        char *container_base_path, int type)
+static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
+                               int dfd_base, char *base_cgroup,
+                               char **controllers, cgroupfs_type_magic_t fs_type)
  {
-       __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
         __do_free struct hierarchy *new = NULL;
-       __do_free_string_list char **controllers = clist;
         int idx;
  
-       if (abspath(container_base_path))
-               return syserrno(-errno, "Container base path must be relative to controller mount");
-
-       if (!controllers && type != CGROUP2_SUPER_MAGIC)
-               return syserrno_set(-EINVAL, "Empty controller list for non-unified cgroup hierarchy passed");
-
-       dfd_mnt = open_at(-EBADF, mountpoint, PROTECT_OPATH_DIRECTORY,
-                         PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
-       if (dfd_mnt < 0)
-               return syserrno(-errno, "Failed to open %s", mountpoint);
-
-       if (!is_empty_string(container_base_path)) {
-               dfd_base = open_at(dfd_mnt, container_base_path,
-                                  PROTECT_OPATH_DIRECTORY,
-                                  PROTECT_LOOKUP_BENEATH_XDEV, 0);
-               if (dfd_base < 0)
-                       return syserrno(-errno, "Failed to open %d(%s)", dfd_base, container_base_path);
-       }
-
-       if (!controllers) {
-               /*
-               * We assume that the cgroup we're currently in has been delegated to
-               * us and we are free to further delege all of the controllers listed
-               * in cgroup.controllers further down the hierarchy.
-                */
-               if (dfd_base < 0)
-                       controllers = cg_unified_get_controllers(dfd_mnt, "cgroup.controllers");
-               else
-                       controllers = cg_unified_get_controllers(dfd_base, "cgroup.controllers");
-               if (!controllers)
-                       controllers = cg_unified_make_empty_controller();
-               if (!controllers[0])
-                       TRACE("No controllers are enabled for delegation");
-       }
-
-       /* Exclude all controllers that cgroup use does not want. */
-       if (!cgroup_use_wants_controllers(ops, controllers))
-               return log_trace(0, "Skipping cgroup hiearchy with non-requested controllers");
+       if (abspath(base_cgroup))
+               return syserror_set(-EINVAL, "Container base path must be relative to controller mount");
  
         new = zalloc(sizeof(*new));
         if (!new)
                 return ret_errno(ENOMEM);
  
-       new->version                    = type;
-       new->controllers                = move_ptr(controllers);
-       new->mountpoint                 = mountpoint;
-       new->container_base_path        = container_base_path;
-       new->cgfd_con                   = -EBADF;
-       new->cgfd_limit                 = -EBADF;
-       new->cgfd_mon                   = -EBADF;
-
-       TRACE("Adding cgroup hierarchy with mountpoint %s and base cgroup %s",
-             mountpoint, container_base_path);
-       for (char *const *it = new->controllers; it && *it; it++)
-               TRACE("The detected hierarchy contains the %s controller", *it);
-
-       idx = append_null_to_list((void ***)&ops->hierarchies);
-       if (dfd_base < 0)
-               new->dfd_base = dfd_mnt;
-       else
-               new->dfd_base = move_fd(dfd_base);
-       new->dfd_mnt = move_fd(dfd_mnt);
-       if (type == CGROUP2_SUPER_MAGIC)
-               ops->unified = new;
-       (ops->hierarchies)[idx] = move_ptr(new);
-       return 0;
-}
-
-/* Get a copy of the mountpoint from @line, which is a line from
- * /proc/self/mountinfo.
- */
-static char *cg_hybrid_get_mountpoint(char *line)
-{
-       char *p = line, *sret = NULL;
-       size_t len;
-       char *p2;
-
-       for (int i = 0; i < 4; i++) {
-               p = strchr(p, ' ');
-               if (!p)
-                       return NULL;
-               p++;
-       }
-
-       if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
-               return NULL;
-
-       p2 = strchr(p + 15, ' ');
-       if (!p2)
-               return NULL;
-       *p2 = '\0';
-
-       len = strlen(p);
-       sret = must_realloc(NULL, len + 1);
-       memcpy(sret, p, len);
-       sret[len] = '\0';
-
-       return sret;
-}
-
-/* Given a multi-line string, return a null-terminated copy of the current line. */
-static char *copy_to_eol(char *p)
-{
-       char *p2, *sret;
-       size_t len;
-
-       p2 = strchr(p, '\n');
-       if (!p2)
-               return NULL;
-
-       len = p2 - p;
-       sret = must_realloc(NULL, len + 1);
-       memcpy(sret, p, len);
-       sret[len] = '\0';
+       new->dfd_con            = -EBADF;
+       new->dfd_lim            = -EBADF;
+       new->dfd_mon            = -EBADF;
  
-       return sret;
-}
-
-/* cgline: pointer to character after the first ':' in a line in a \n-terminated
- * /proc/self/cgroup file. Check whether controller c is present.
- */
-static bool controller_in_clist(char *cgline, char *c)
-{
-       __do_free char *tmp = NULL;
-       char *tok, *eol;
-       size_t len;
-
-       eol = strchr(cgline, ':');
-       if (!eol)
-               return false;
-
-       len = eol - cgline;
-       tmp = must_realloc(NULL, len + 1);
-       memcpy(tmp, cgline, len);
-       tmp[len] = '\0';
-
-       lxc_iterate_parts(tok, tmp, ",")
-               if (strequal(tok, c))
-                       return true;
-
-       return false;
-}
-
-static inline char *trim(char *s)
-{
-       size_t len;
-
-       len = strlen(s);
-       while ((len > 1) && (s[len - 1] == '\n'))
-               s[--len] = '\0';
-
-       return s;
-}
-
-/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
- * @controller.
- */
-static char *cg_hybrid_get_current_cgroup(bool relative, char *basecginfo,
-                                         char *controller, int type)
-{
-       char *base_cgroup = basecginfo;
-
-       for (;;) {
-               bool is_cgv2_base_cgroup = false;
-
-               /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
-               if ((type == CGROUP2_SUPER_MAGIC) && (*base_cgroup == '0'))
-                       is_cgv2_base_cgroup = true;
-
-               base_cgroup = strchr(base_cgroup, ':');
-               if (!base_cgroup)
-                       return NULL;
-               base_cgroup++;
-
-               if (is_cgv2_base_cgroup || (controller && controller_in_clist(base_cgroup, controller))) {
-                       __do_free char *copy = NULL;
-
-                       base_cgroup = strchr(base_cgroup, ':');
-                       if (!base_cgroup)
-                               return NULL;
-                       base_cgroup++;
-
-                       copy = copy_to_eol(base_cgroup);
-                       if (!copy)
-                               return NULL;
-                       trim(copy);
-
-                       if (!relative) {
-                               base_cgroup = prune_init_scope(copy);
-                               if (!base_cgroup)
-                                       return NULL;
-                       } else {
-                               base_cgroup = copy;
-                       }
-
-                       if (abspath(base_cgroup))
-                               base_cgroup = deabs(base_cgroup);
-
-                       /* We're allowing base_cgroup to be "". */
-                       return strdup(base_cgroup);
-               }
-
-               base_cgroup = strchr(base_cgroup, '\n');
-               if (!base_cgroup)
-                       return NULL;
-               base_cgroup++;
-       }
-}
-
-static void must_append_string(char ***list, char *entry)
-{
-       int newentry;
-       char *copy;
-
-       newentry = append_null_to_list((void ***)list);
-       copy = must_copy_string(entry);
-       (*list)[newentry] = copy;
-}
-
-static int get_existing_subsystems(char ***klist, char ***nlist)
-{
-       __do_free char *line = NULL;
-       __do_fclose FILE *f = NULL;
-       size_t len = 0;
+       new->fs_type            = fs_type;
+       new->controllers        = controllers;
+       new->at_mnt             = mnt;
+       new->at_base            = base_cgroup;
  
-       f = fopen("/proc/self/cgroup", "re");
-       if (!f)
-               return -1;
+       new->dfd_mnt            = dfd_mnt;
+       new->dfd_base           = dfd_base;
  
-       while (getline(&line, &len, f) != -1) {
-               char *p, *p2, *tok;
-               p = strchr(line, ':');
-               if (!p)
-                       continue;
-               p++;
-               p2 = strchr(p, ':');
-               if (!p2)
-                       continue;
-               *p2 = '\0';
+       TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
+             mnt, maybe_empty(base_cgroup));
+       for (char *const *it = new->controllers; it && *it; it++)
+               TRACE("The hierarchy contains the %s controller", *it);
  
-               /* If the kernel has cgroup v2 support, then /proc/self/cgroup
-                * contains an entry of the form:
-                *
-                *      0::/some/path
-                *
-                * In this case we use "cgroup2" as controller name.
-                */
-               if ((p2 - p) == 0) {
-                       must_append_string(klist, "cgroup2");
-                       continue;
-               }
+       idx = list_add((void ***)&ops->hierarchies);
+       if (idx < 0)
+               return ret_errno(idx);
  
-               lxc_iterate_parts(tok, p, ",") {
-                       if (strnequal(tok, "name=", 5))
-                               must_append_string(nlist, tok);
-                       else
-                               must_append_string(klist, tok);
-               }
-       }
+       if (fs_type == UNIFIED_HIERARCHY)
+               ops->unified = new;
+       (ops->hierarchies)[idx] = move_ptr(new);
  
         return 0;
  }
  
-static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
-                                             char **nlist)
-{
-       int k;
-       char **it;
-
-       TRACE("basecginfo is:");
-       TRACE("%s", basecginfo);
-
-       for (k = 0, it = klist; it && *it; it++, k++)
-               TRACE("kernel subsystem %d: %s", k, *it);
-
-       for (k = 0, it = nlist; it && *it; it++, k++)
-               TRACE("named subsystem %d: %s", k, *it);
-}
-
  static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
  {
         if (!path_prune || !hierarchies)
@@ -797,9 +474,7 @@ static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_p
                 else
                         TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
  
-               if (h->container_limit_path != h->container_full_path)
-                       free_disarm(h->container_limit_path);
-               free_disarm(h->container_full_path);
+               free_equal(h->path_lim, h->path_con);
         }
  
         return 0;
@@ -1010,29 +685,29 @@ static bool cpuset1_initialize(int dfd_base, int dfd_next)
          */
         bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
         if (bytes < 0)
-               return syserrno(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);
+               return syserror_ret(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);
  
         /*
         * Initialize cpuset.cpus and make remove any isolated
         * and offline cpus.
          */
         if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
-               return syserrno(false, "Failed to initialize cpuset.cpus");
+               return syserror_ret(false, "Failed to initialize cpuset.cpus");
  
         /* Read cpuset.mems from parent... */
         bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
         if (bytes < 0)
-               return syserrno(false, "Failed to read file %d(cpuset.mems)", dfd_base);
+               return syserror_ret(false, "Failed to read file %d(cpuset.mems)", dfd_base);
  
         /* ... and copy to first cgroup in the tree... */
         bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
         if (bytes < 0)
-               return syserrno(false, "Failed to write %d(cpuset.mems)", dfd_next);
+               return syserror_ret(false, "Failed to write %d(cpuset.mems)", dfd_next);
  
         /* ... and finally turn on cpuset inheritance. */
         bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
         if (bytes < 0)
-               return syserrno(false, "Failed to write %d(cgroup.clone_children)", dfd_next);
+               return syserror_ret(false, "Failed to write %d(cgroup.clone_children)", dfd_next);
  
         return log_trace(true, "Initialized cpuset in the legacy hierarchy");
  }
@@ -1061,15 +736,15 @@ static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
                  * absolute nor walks upwards.
                  */
                 if (abspath(cur))
-                       return syserrno_set(-EINVAL, "No absolute paths allowed");
+                       return syserror_set(-EINVAL, "No absolute paths allowed");
  
                 if (strnequal(cur, "..", STRLITERALLEN("..")))
-                       return syserrno_set(-EINVAL, "No upward walking paths allowed");
+                       return syserror_set(-EINVAL, "No upward walking paths allowed");
  
                 ret = mkdirat(dfd_cur, cur, mode);
                 if (ret < 0) {
                         if (errno != EEXIST)
-                               return syserrno(-errno, "Failed to create %d(%s)", dfd_cur, cur);
+                               return syserror("Failed to create %d(%s)", dfd_cur, cur);
  
                         ret = -EEXIST;
                 }
@@ -1077,12 +752,12 @@ static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
  
                 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
                 if (dfd_final < 0)
-                       return syserrno(-errno, "Fail to open%s directory %d(%s)",
+                       return syserror("Fail to open%s directory %d(%s)",
                                         !ret ? " newly created" : "", dfd_base, cur);
                 if (dfd_cur != dfd_base)
                         close(dfd_cur);
                 else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
-                       return syserrno(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
+                       return syserror_set(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
                 /*
                  * Leave dfd_final pointing to the last fd we opened so
                  * it will be automatically zapped if we return early.
@@ -1093,7 +768,7 @@ static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
         /* The final cgroup must be succesfully creatd by us. */
         if (ret) {
                 if (ret != -EEXIST || !eexist_ignore)
-                       return syserrno_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
+                       return syswarn_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
         }
  
         return move_fd(dfd_final);
@@ -1104,7 +779,6 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
                                const char *cgroup_leaf, bool payload)
  {
         __do_close int fd_limit = -EBADF, fd_final = -EBADF;
-       __do_free char *path = NULL, *limit_path = NULL;
         bool cpuset_v1 = false;
  
         /*
@@ -1117,10 +791,13 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
                 /* With isolation both parts need to not already exist. */
                 fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
                 if (fd_limit < 0)
-                       return syserrno(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);
+                       return syswarn_ret(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);
+
+               h->path_lim = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
+               h->dfd_lim = move_fd(fd_limit);
  
                 TRACE("Created limit cgroup %d->%d(%s)",
-                     fd_limit, h->dfd_base, cgroup_limit_dir);
+                     h->dfd_lim, h->dfd_base, cgroup_limit_dir);
  
                 /*
                  * With isolation the devices legacy cgroup needs to be
@@ -1130,46 +807,38 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
                  */
                 if (string_in_list(h->controllers, "devices") &&
                     !ops->setup_limits_legacy(ops, conf, true))
-                       return log_error(false, "Failed to setup legacy device limits");
-
-               limit_path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);
-               path = must_make_path(limit_path, cgroup_leaf, NULL);
+                       return log_warn(false, "Failed to setup legacy device limits");
  
                 /*
                  * If we use a separate limit cgroup, the leaf cgroup, i.e. the
                  * cgroup the container actually resides in, is below fd_limit.
                  */
-               fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
+               fd_final = __cgroup_tree_create(h->dfd_lim, cgroup_leaf, 0755, cpuset_v1, false);
                 if (fd_final < 0) {
                         /* Ensure we don't leave any garbage behind. */
                         if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
                                 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
                         else
                                 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
+                       return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
                 }
-       } else {
-               path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);
+               h->dfd_con = move_fd(fd_final);
+               h->path_con = must_make_path(h->path_lim, cgroup_leaf, NULL);
  
+       } else {
                 fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
-       }
-       if (fd_final < 0)
-               return syserrno(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
+               if (fd_final < 0)
+                       return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
  
-       if (payload) {
-               h->cgfd_con = move_fd(fd_final);
-               h->container_full_path = move_ptr(path);
+               if (payload) {
+                       h->dfd_con = move_fd(fd_final);
+                       h->dfd_lim = h->dfd_con;
+                       h->path_con = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
  
-               if (fd_limit < 0)
-                       h->cgfd_limit = h->cgfd_con;
-               else
-                       h->cgfd_limit = move_fd(fd_limit);
-
-               if (limit_path)
-                       h->container_limit_path = move_ptr(limit_path);
-               else
-                       h->container_limit_path = h->container_full_path;
-       } else {
-               h->cgfd_mon = move_fd(fd_final);
+                       h->path_lim = h->path_con;
+               } else {
+                       h->dfd_mon = move_fd(fd_final);
+               }
         }
  
         return true;
@@ -1182,21 +851,17 @@ static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
  
         if (payload) {
                 /* Check whether we actually created the cgroup to prune. */
-               if (h->cgfd_limit < 0)
+               if (h->dfd_lim < 0)
                         prune = false;
  
-               if (h->container_full_path != h->container_limit_path)
-                       free_disarm(h->container_limit_path);
-               free_disarm(h->container_full_path);
-
-               close_prot_errno_disarm(h->cgfd_con);
-               close_prot_errno_disarm(h->cgfd_limit);
+               free_equal(h->path_con, h->path_lim);
+               close_equal(h->dfd_con, h->dfd_lim);
         } else {
                 /* Check whether we actually created the cgroup to prune. */
-               if (h->cgfd_mon < 0)
+               if (h->dfd_mon < 0)
                         prune = false;
  
-               close_prot_errno_disarm(h->cgfd_mon);
+               close_prot_errno_disarm(h->dfd_mon);
         }
  
         /* We didn't create this cgroup. */
@@ -1486,7 +1151,7 @@ __cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lx
                                                true))
                                 continue;
  
-                       DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
+                       DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
                         for (int j = 0; j <= i; j++)
                                 cgroup_tree_prune_leaf(ops->hierarchies[j],
                                                        limit_cgroup, true);
@@ -1542,20 +1207,20 @@ __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
                 struct hierarchy *h = ops->hierarchies[i];
                 int ret;
  
-               ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", monitor, monitor_len);
+               ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
                 if (ret)
-                       return log_error_errno(false, errno, "Failed to enter cgroup %d", h->cgfd_mon);
+                       return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
  
-               TRACE("Moved monitor into cgroup %d", h->cgfd_mon);
+               TRACE("Moved monitor into cgroup %d", h->dfd_mon);
  
                 if (handler->transient_pid <= 0)
                         continue;
  
-               ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", transient, transient_len);
+               ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
                 if (ret)
-                       return log_error_errno(false, errno, "Failed to enter cgroup %d", h->cgfd_mon);
+                       return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
  
-               TRACE("Moved transient process into cgroup %d", h->cgfd_mon);
+               TRACE("Moved transient process into cgroup %d", h->dfd_mon);
  
                 /*
                  * we don't keep the fds for non-unified hierarchies around
@@ -1564,7 +1229,7 @@ __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
                  * lot of them.
                  */
                 if (!is_unified_hierarchy(h))
-                       close_prot_errno_disarm(h->cgfd_mon);
+                       close_prot_errno_disarm(h->dfd_mon);
         }
         handler->transient_pid = -1;
  
@@ -1601,11 +1266,11 @@ __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
                     (handler->clone_flags & CLONE_INTO_CGROUP))
                         continue;
  
-               ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
+               ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
                 if (ret != 0)
-                       return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);
+                       return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);
  
-               TRACE("Moved container into %s cgroup via %d", h->container_full_path, h->cgfd_con);
+               TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
         }
  
         return true;
@@ -1665,7 +1330,10 @@ static int chown_cgroup_wrapper(void *data)
                 destuid = 0;
  
         for (int i = 0; arg->hierarchies[i]; i++) {
-               int dirfd = arg->hierarchies[i]->cgfd_con;
+               int dirfd = arg->hierarchies[i]->dfd_con;
+
+               if (dirfd < 0)
+                       return syserror_set(-EBADF, "Invalid cgroup file descriptor");
  
                 (void)fchowmodat(dirfd, "", destuid, nsgid, 0775);
  
@@ -1677,15 +1345,15 @@ static int chown_cgroup_wrapper(void *data)
                  * files (which systemd in wily insists on doing).
                  */
  
-               if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC)
+               if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
                         (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);
  
                 (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);
  
-               if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
+               if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
                         continue;
  
-               for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++)
+               for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
                         (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
         }
  
@@ -1723,7 +1391,7 @@ __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
         return true;
  }
  
-__cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops)
+__cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
  {
         if (!ops)
                 return;
@@ -1733,15 +1401,12 @@ __cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops)
  
         for (int i = 0; ops->hierarchies[i]; i++) {
                 struct hierarchy *h = ops->hierarchies[i];
-               /*
-                * we don't keep the fds for non-unified hierarchies around
-                * mainly because we don't make use of them anymore after the
-                * core cgroup setup is done but also because there are quite a
-                * lot of them.
-                */
-               if (!is_unified_hierarchy(h))
-                       close_prot_errno_disarm(h->cgfd_con);
+
+               /* Close all monitor cgroup file descriptors. */
+               close_prot_errno_disarm(h->dfd_mon);
         }
+       /* Close the cgroup root file descriptor. */
+       close_prot_errno_disarm(ops->dfd_mnt);
  
         /*
          * The checking for freezer support should obviously be done at cgroup
@@ -1758,10 +1423,10 @@ __cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops)
          * for our container which means we check here.
          */
          if (pure_unified_layout(ops) &&
-            !faccessat(ops->unified->cgfd_con, "cgroup.freeze", F_OK,
+            !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
                         AT_SYMLINK_NOFOLLOW)) {
                 TRACE("Unified hierarchy supports freezer");
-               ops->unified->freezer_controller = 1;
+               ops->unified->utilities |= FREEZER_CONTROLLER;
          }
  }
  
@@ -1785,7 +1450,7 @@ static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
   * control/the/cg/path.
   */
  static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
-                                      char *controllerpath, char *cgpath,
+                                      char *hierarchy_mnt, char *cgpath,
                                        const char *container_cgroup)
  {
         __do_free char *sourcepath = NULL;
@@ -1794,25 +1459,24 @@ static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarc
  
         if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
             (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
-               ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
+               ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
                 if (ret < 0)
                         return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
-                                              controllerpath, controllerpath);
+                                              hierarchy_mnt, hierarchy_mnt);
  
-               remount_flags = add_required_remount_flags(controllerpath,
-                                                          controllerpath,
+               remount_flags = add_required_remount_flags(hierarchy_mnt,
+                                                          hierarchy_mnt,
                                                            flags | MS_REMOUNT);
-               ret = mount(controllerpath, controllerpath, "cgroup",
+               ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
                             remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
                             NULL);
                 if (ret < 0)
-                       return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath);
+                       return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);
  
-               INFO("Remounted %s read-only", controllerpath);
+               INFO("Remounted %s read-only", hierarchy_mnt);
         }
  
-       sourcepath = must_make_path(h->mountpoint, h->container_base_path,
-                                   container_cgroup, NULL);
+       sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
         if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
                 flags |= MS_RDONLY;
  
@@ -2025,8 +1689,8 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
                 dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
                                           PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
                 if (dfd_mnt_unified < 0)
-                       return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
-                                       DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
+                       return syserror_ret(false, "Failed to open %d(%s)",
+                                           rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
                 /*
                  * If cgroup namespaces are supported but the container will
                  * not have CAP_SYS_ADMIN after it has started we need to mount
@@ -2059,7 +1723,7 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
                          */
                         ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
                         if (ret < 0)
-                               return syserrno(false, "Failed to force mount cgroup filesystem in cgroup namespace");
+                               return syserror_ret(false, "Failed to force mount cgroup filesystem in cgroup namespace");
  
                         return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
                 } else {
@@ -2090,7 +1754,7 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
                         }
                 }
  
-               return syserrno(false, "Failed to mount cgroups");
+               return syserror_ret(false, "Failed to mount cgroups");
         }
  
         /*
@@ -2128,21 +1792,16 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
         dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
                                 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
         if (dfd_mnt_tmpfs < 0)
-               return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
-                               DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
+               return syserror_ret(false, "Failed to open %d(%s)",
+                                   rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
  
         for (int i = 0; ops->hierarchies[i]; i++) {
-               __do_free char *controllerpath = NULL, *path2 = NULL;
+               __do_free char *hierarchy_mnt = NULL, *path2 = NULL;
                 struct hierarchy *h = ops->hierarchies[i];
-               char *controller = strrchr(h->mountpoint, '/');
-
-               if (!controller)
-                       continue;
-               controller++;
  
-               ret = mkdirat(dfd_mnt_tmpfs, controller, 0000);
+               ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
                 if (ret < 0)
-                       return log_error_errno(false, errno, "Failed to create cgroup mountpoint %d(%s)", dfd_mnt_tmpfs, controller);
+                       return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);
  
                 if (in_cgroup_ns && wants_force_mount) {
                         /*
@@ -2150,7 +1809,8 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
                          * will not have CAP_SYS_ADMIN after it has started we
                          * need to mount the cgroups manually.
                          */
-                       ret = cgroupfs_mount(cgroup_automount_type, h, rootfs, dfd_mnt_tmpfs, controller);
+                       ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
+                                            dfd_mnt_tmpfs, h->at_mnt);
                         if (ret < 0)
                                 return false;
  
@@ -2158,7 +1818,8 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
                 }
  
                 /* Here is where the ancient kernel section begins. */
-               ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs, dfd_mnt_tmpfs, controller);
+               ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
+                                         dfd_mnt_tmpfs, h->at_mnt);
                 if (ret < 0)
                         return false;
  
@@ -2168,13 +1829,16 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
                 if (!cgroup_root)
                         cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
  
-               controllerpath = must_make_path(cgroup_root, controller, NULL);
-               path2 = must_make_path(controllerpath, h->container_base_path, ops->container_cgroup, NULL);
+               hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
+               path2 = must_make_path(hierarchy_mnt, h->at_base,
+                                      ops->container_cgroup, NULL);
                 ret = mkdir_p(path2, 0755);
                 if (ret < 0 && (errno != EEXIST))
                         return false;
  
-               ret = cg_legacy_mount_controllers(cgroup_automount_type, h, controllerpath, path2, ops->container_cgroup);
+               ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
+                                                 hierarchy_mnt, path2,
+                                                 ops->container_cgroup);
                 if (ret < 0)
                         return false;
         }
@@ -2202,10 +1866,9 @@ __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
                 __do_free char *fullpath = NULL;
                 int ret;
  
-               fullpath =
-                   must_make_path(ops->hierarchies[i]->mountpoint,
-                                  ops->hierarchies[i]->container_base_path,
-                                  "cgroup.procs", NULL);
+               fullpath = make_cgroup_path(ops->hierarchies[i],
+                                           ops->hierarchies[i]->at_base,
+                                           "cgroup.procs", NULL);
                 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
                 if (ret != 0)
                         return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
@@ -2241,7 +1904,7 @@ __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
         if (!ops->hierarchies)
                 return ret_set_errno(false, ENOENT);
  
-       /* sanity check n */
+       /* consistency check n */
         for (i = 0; i < n; i++)
                 if (!ops->hierarchies[i])
                         return ret_set_errno(false, ENOENT);
@@ -2251,7 +1914,7 @@ __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
         return true;
  }
  
-static bool cg_legacy_freeze(struct cgroup_ops *ops)
+static int cg_legacy_freeze(struct cgroup_ops *ops)
  {
         struct hierarchy *h;
  
@@ -2259,12 +1922,12 @@ static bool cg_legacy_freeze(struct cgroup_ops *ops)
         if (!h)
                 return ret_set_errno(-1, ENOENT);
  
-       return lxc_write_openat(h->container_full_path, "freezer.state",
+       return lxc_write_openat(h->path_con, "freezer.state",
                                 "FROZEN", STRLITERALLEN("FROZEN"));
  }
  
  static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
-                                   struct lxc_epoll_descr *descr)
+                                   struct lxc_async_descr *descr)
  {
         __do_free char *line = NULL;
         __do_fclose FILE *f = NULL;
@@ -2297,22 +1960,22 @@ static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
                                 const char *wait_error)
  {
         __do_close int fd = -EBADF;
-       call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
+       call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL;
         int ret;
-       struct lxc_epoll_descr descr;
+       struct lxc_async_descr descr;
         struct hierarchy *h;
  
         h = ops->unified;
         if (!h)
                 return ret_set_errno(-1, ENOENT);
  
-       if (!h->container_full_path)
+       if (!h->path_con)
                 return ret_set_errno(-1, EEXIST);
  
         if (timeout != 0) {
                 __do_free char *events_file = NULL;
  
-               events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
+               events_file = must_make_path(h->path_con, "cgroup.events", NULL);
                 fd = open(events_file, O_RDONLY | O_CLOEXEC);
                 if (fd < 0)
                         return log_error_errno(-1, errno, "Failed to open cgroup.events file");
@@ -2324,12 +1987,16 @@ static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
                 /* automatically cleaned up now */
                 descr_ptr = &descr;
  
-               ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
+               ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI,
+                                                     freezer_cgroup_events_cb,
+                                                     default_cleanup_handler,
+                                                     INT_TO_PTR(state_num),
+                                                     "freezer_cgroup_events_cb");
                 if (ret < 0)
                         return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
         }
  
-       ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", state_string, 1);
+       ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
         if (ret < 0)
                 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
  
@@ -2365,7 +2032,7 @@ static int cg_legacy_unfreeze(struct cgroup_ops *ops)
         if (!h)
                 return ret_set_errno(-1, ENOENT);
  
-       return lxc_write_openat(h->container_full_path, "freezer.state",
+       return lxc_write_openat(h->path_con, "freezer.state",
                                 "THAWED", STRLITERALLEN("THAWED"));
  }
  
@@ -2391,20 +2058,28 @@ static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
                                         const char *controller, bool limiting)
  {
         struct hierarchy *h;
+       size_t len;
+       const char *path;
  
         h = get_hierarchy(ops, controller);
         if (!h)
-               return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
-                                     controller ? controller : "(null)");
+               return log_warn_errno(NULL, ENOENT,
+                                     "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));
  
         if (limiting)
-               return h->container_limit_path
-                          ? h->container_limit_path + strlen(h->mountpoint)
-                          : NULL;
+               path = h->path_lim;
+       else
+               path = h->path_con;
+       if (!path)
+               return NULL;
  
-       return h->container_full_path
-                  ? h->container_full_path + strlen(h->mountpoint)
-                  : NULL;
+       len = strlen(h->at_mnt);
+       if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
+                      STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
+               path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
+               path += strspn(path, "/");
+       }
+       return path += len;
  }
  
  __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
@@ -2413,8 +2088,8 @@ __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
      return cgfsng_get_cgroup_do(ops, controller, false);
  }
  
-__cgfsng_ops static const char *cgfsng_get_limiting_cgroup(struct cgroup_ops *ops,
-                                                          const char *controller)
+__cgfsng_ops static const char *cgfsng_get_limit_cgroup(struct cgroup_ops *ops,
+                                                       const char *controller)
  {
      return cgfsng_get_cgroup_do(ops, controller, true);
  }
@@ -2426,7 +2101,7 @@ static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
                                                        const char *inpath,
                                                        const char *filename)
  {
-       return must_make_path(h->mountpoint, inpath, filename, NULL);
+       return make_cgroup_path(h, inpath, filename, NULL);
  }
  
  static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
@@ -2534,16 +2209,13 @@ static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
                                         int *sk_fd, pid_t pid)
  {
         __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
-       int target_fds[2];
         char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
         size_t pidstr_len;
         ssize_t ret;
  
-       ret = lxc_abstract_unix_recv_fds(sk, target_fds, 2, NULL, 0);
-       if (ret <= 0)
+       ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1);
+       if (ret < 0)
                 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
-       target_fd0 = target_fds[0];
-       target_fd1 = target_fds[1];
  
         pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
  
@@ -2616,7 +2288,7 @@ static int __cg_unified_attach(const struct hierarchy *h,
         ret = cgroup_attach(conf, name, lxcpath, pid);
         if (ret == 0)
                 return log_trace(0, "Attached to unified cgroup via command handler");
-       if (ret != -ENOCGROUP2)
+       if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2)
                 return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
  
         /* Fall back to retrieving the path for the unified cgroup. */
@@ -2625,7 +2297,7 @@ static int __cg_unified_attach(const struct hierarchy *h,
         if (!cgroup)
                 return 0;
  
-       path = must_make_path(h->mountpoint, cgroup, NULL);
+       path = make_cgroup_path(h, cgroup, NULL);
  
         unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
         if (unified_fd < 0)
@@ -2676,7 +2348,7 @@ __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
                 __do_free char *fullpath = NULL, *path = NULL;
                 struct hierarchy *h = ops->hierarchies[i];
  
-               if (h->version == CGROUP2_SUPER_MAGIC) {
+               if (h->fs_type == UNIFIED_HIERARCHY) {
                         ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
                                                   h->controllers[0]);
                         if (ret < 0)
@@ -2686,9 +2358,17 @@ __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
                 }
  
                 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
-               /* not running */
-               if (!path)
-                       return false;
+               if (!path) {
+                       /*
+                        * Someone might have created a name=<controller>
+                        * controller after the container has started and so
+                        * the container doesn't make use of this controller.
+                        *
+                        * Link: https://github.com/lxc/lxd/issues/8577
+                        */
+                       TRACE("Skipping unused %s controller", maybe_empty(h->controllers[0]));
+                       continue;
+               }
  
                 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
                 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
@@ -2717,12 +2397,15 @@ __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
         if (!ops)
                 return ret_set_errno(-1, ENOENT);
  
-       controller = must_copy_string(filename);
+       controller = strdup(filename);
+       if (!controller)
+               return ret_errno(ENOMEM);
+
         p = strchr(controller, '.');
         if (p)
                 *p = '\0';
  
-       path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
+       path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
         /* not running */
         if (!path)
                 return -1;
@@ -2779,19 +2462,9 @@ static int device_cgroup_rule_parse(struct device_item *device, const char *key,
                 device->type = 'a';
                 device->major = -1;
                 device->minor = -1;
-
-               if (device->allow) /* allow all devices */
-                       device->global_rule = LXC_BPF_DEVICE_CGROUP_DENYLIST;
-               else /* deny all devices */
-                       device->global_rule = LXC_BPF_DEVICE_CGROUP_ALLOWLIST;
-
-               device->allow = -1;
                 return 0;
         }
  
-       /* local rule */
-       device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
-
         switch (*val) {
         case 'a':
                 __fallthrough;
@@ -2871,7 +2544,10 @@ __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
             is_empty_string(name) || is_empty_string(lxcpath))
                 return ret_errno(EINVAL);
  
-       controller = must_copy_string(key);
+       controller = strdup(key);
+       if (!controller)
+               return ret_errno(ENOMEM);
+
         p = strchr(controller, '.');
         if (p)
                 *p = '\0';
@@ -2891,7 +2567,7 @@ __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
                 return 0;
         }
  
-       path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
+       path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
         /* not running */
         if (!path)
                 return -1;
@@ -2923,7 +2599,9 @@ static int device_cgroup_rule_parse_devpath(struct device_item *device,
         char *p;
         struct stat sb;
  
-       path = must_copy_string(devpath);
+       path = strdup(devpath);
+       if (!path)
+               return ret_errno(ENOMEM);
  
         /*
          * Read path followed by mode. Ignore any trailing text.
@@ -2974,7 +2652,6 @@ static int device_cgroup_rule_parse_devpath(struct device_item *device,
         device->major = MAJOR(sb.st_rdev);
         device->minor = MINOR(sb.st_rdev);
         device->allow = 1;
-       device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
  
         return 0;
  }
@@ -3011,7 +2688,10 @@ static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
         char converted_value[50];
         struct hierarchy *h;
  
-       controller = must_copy_string(filename);
+       controller = strdup(filename);
+       if (!controller)
+               return ret_errno(ENOMEM);
+
         p = strchr(controller, '.');
         if (p)
                 *p = '\0';
@@ -3030,11 +2710,11 @@ static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
                 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
  
         if (is_cpuset) {
-               int ret = lxc_write_openat(h->container_full_path, filename, value, strlen(value));
+               int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
                 if (ret)
                         return ret;
         }
-       return lxc_write_openat(h->container_limit_path, filename, value, strlen(value));
+       return lxc_write_openat(h->path_lim, filename, value, strlen(value));
  }
  
  __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
@@ -3105,16 +2785,22 @@ static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
         struct device_item device_item = {};
         int ret;
  
-       if (strequal("devices.allow", key) && *val == '/')
+       if (strequal("devices.allow", key) && abspath(val))
                 ret = device_cgroup_rule_parse_devpath(&device_item, val);
         else
                 ret = device_cgroup_rule_parse(&device_item, key, val);
         if (ret < 0)
-               return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", key, val);
+               return syserror_set(-EINVAL, "Failed to parse device rule %s=%s", key, val);
  
-       ret = bpf_list_add_device(conf, &device_item);
+       /*
+        * Note that bpf_list_add_device() returns 1 if it altered the device
+        * list and 0 if it didn't; both return values indicate success.
+        * Only a negative return value indicates an error.
+        */
+       ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
         if (ret < 0)
                 return -1;
+
         return 0;
  }
  
@@ -3156,7 +2842,7 @@ __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
                 if (strnequal("devices", cg->subsystem, 7))
                         ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
                 else
-                       ret = lxc_write_openat(h->container_limit_path, cg->subsystem, cg->value, strlen(cg->value));
+                       ret = lxc_write_openat(h->path_lim, cg->subsystem, cg->value, strlen(cg->value));
                 if (ret < 0)
                         return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
  
@@ -3168,12 +2854,8 @@ __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
  
  __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
  {
-       __do_bpf_program_free struct bpf_program *prog = NULL;
-       int ret;
         struct lxc_conf *conf;
         struct hierarchy *unified;
-       struct lxc_list *it;
-       struct bpf_program *prog_old;
  
         if (!ops)
                 return ret_set_errno(false, ENOENT);
@@ -3189,65 +2871,12 @@ __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct
         conf = handler->conf;
  
         unified = ops->unified;
-       if (!unified || !unified->bpf_device_controller ||
-           !unified->container_full_path || lxc_list_empty(&conf->devices))
+       if (!unified || !device_utility_controller(unified) ||
+           !unified->path_con ||
+           lxc_list_empty(&(conf->bpf_devices).device_item))
                 return true;
  
-       prog = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
-       if (!prog)
-               return log_error_errno(false, ENOMEM, "Failed to create new bpf program");
-
-       ret = bpf_program_init(prog);
-       if (ret)
-               return log_error_errno(false, ENOMEM, "Failed to initialize bpf program");
-
-       bpf_device_set_type(prog, &conf->devices);
-       TRACE("Device bpf %s all devices by default",
-             bpf_device_block_all(prog) ? "blocks" : "allows");
-
-       lxc_list_for_each(it, &conf->devices) {
-               struct device_item *cur = it->elem;
-
-               if (!bpf_device_add(prog, cur)) {
-                       TRACE("Skipping type %c, major %d, minor %d, access %s, allow %d",
-                             cur->type, cur->major, cur->minor, cur->access,
-                             cur->allow);
-                       continue;
-               }
-
-               ret = bpf_program_append_device(prog, cur);
-               if (ret)
-                       return log_error_errno(false, ENOMEM, "Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
-                                              cur->type,
-                                              cur->major,
-                                              cur->minor,
-                                              cur->access,
-                                              cur->allow,
-                                              cur->global_rule);
-               TRACE("Added rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
-                     cur->type,
-                     cur->major,
-                     cur->minor,
-                     cur->access,
-                     cur->allow,
-                     cur->global_rule);
-       }
-
-       ret = bpf_program_finalize(prog);
-       if (ret)
-               return log_error_errno(false, ENOMEM, "Failed to finalize bpf program");
-
-       ret = bpf_program_cgroup_attach(prog, BPF_CGROUP_DEVICE,
-                                       unified->cgfd_limit, -EBADF,
-                                       BPF_F_ALLOW_MULTI);
-       if (ret)
-               return log_error_errno(false, ENOMEM, "Failed to attach bpf program");
-
-       /* Replace old bpf program. */
-       prog_old = move_ptr(ops->cgroup2_devices);
-       ops->cgroup2_devices = move_ptr(prog);
-       prog = move_ptr(prog_old);
-       return true;
+       return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
  }
  
  static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
@@ -3304,20 +2933,20 @@ static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cg
                  * absolute nor walks upwards.
                  */
                 if (abspath(cur))
-                       return syserrno_set(-EINVAL, "No absolute paths allowed");
+                       return syserror_set(-EINVAL, "No absolute paths allowed");
  
                 if (strnequal(cur, "..", STRLITERALLEN("..")))
-                       return syserrno_set(-EINVAL, "No upward walking paths allowed");
+                       return syserror_set(-EINVAL, "No upward walking paths allowed");
  
                 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
                 if (ret < 0)
-                       return syserrno(-errno, "Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
+                       return syserror("Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
  
                 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
  
                 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
                 if (dfd_final < 0)
-                       return syserrno(-errno, "Fail to open directory %d(%s)", dfd_cur, cur);
+                       return syserror("Fail to open directory %d(%s)", dfd_cur, cur);
                 if (dfd_cur != unified->dfd_base)
                         close(dfd_cur);
                 /*
@@ -3346,21 +2975,65 @@ __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *
         return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
  }
  
-static void cg_unified_delegate(char ***delegate)
+static inline bool unified_cgroup(const char *line)
  {
+       return line[0] == '0'; /* unified entries have the form "0::<path>" */
+}
+
+/* Turn a "0::<path>" line into a heap-allocated relative cgroup path or an ERR_PTR(). */
+static inline char *current_unified_cgroup(bool relative, char *line)
+{
+       char *path = line + STRLITERALLEN("0::");
+       char *copy;
+
+       /* Only absolute cgroup paths are accepted. */
+       if (!abspath(path))
+               return ERR_PTR(-EINVAL);
+
+       /* Unless a relative layout was requested, remove init.scope. */
+       if (!relative)
+               path = prune_init_scope(path);
+
+       /* Make the path relative. */
+       path = deabs(path);
+
+       copy = strdup(path);
+       if (copy)
+               return copy;
+       return ERR_PTR(-ENOMEM);
+}
+
+/* Skip a leading "name=" prefix, if present; otherwise return the input unchanged. */
+static inline const char *unprefix(const char *controllers)
+{
+       bool named = strnequal(controllers, "name=", STRLITERALLEN("name="));
+       return named ? &controllers[STRLITERALLEN("name=")] : controllers;
+}
+
+static int __list_cgroup_delegate(char ***delegate)
+{
+       __do_free char **list = NULL;
         __do_free char *buf = NULL;
-       char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
+       char *standard[] = {
+               "cgroup.procs",
+               "cgroup.threads",
+               "cgroup.subtree_control",
+               "memory.oom.group",
+               NULL,
+       };
         char *token;
-       int idx;
+       int ret;
  
         buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
         if (!buf) {
                 for (char **p = standard; p && *p; p++) {
-                       idx = append_null_to_list((void ***)delegate);
-                       (*delegate)[idx] = must_copy_string(*p);
+                       ret = list_add_string(&list, *p);
+                       if (ret < 0)
+                               return ret;
                 }
-               SYSWARN("Failed to read /sys/kernel/cgroup/delegate");
-               return;
+
+               *delegate = move_ptr(list);
+               return syswarn_ret(0, "Failed to read /sys/kernel/cgroup/delegate");
         }
  
         lxc_iterate_parts(token, buf, " \t\n") {
@@ -3371,201 +3044,302 @@ static void cg_unified_delegate(char ***delegate)
                 if (strequal(token, "cgroup.procs"))
                         continue;
  
-               idx = append_null_to_list((void ***)delegate);
-               (*delegate)[idx] = must_copy_string(token);
+               ret = list_add_string(&list, token);
+               if (ret < 0)
+                       return ret;
         }
+
+       *delegate = move_ptr(list);
+       return 0;
  }
  
-/* At startup, parse_hierarchies finds all the info we need about cgroup
- * mountpoints and current cgroups, and stores it in @d.
- */
-static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
+/* Report whether every delegation file that exists under @dfd_base is writable; on success store the list in @ret_files. */
+static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
+{
+       __do_free_string_list char **list = NULL;
+       int ret;
+
+       ret = __list_cgroup_delegate(&list);
+       if (ret < 0) /* bool return: passing the negative ret through would read as "true" */
+               return syserror_ret(false, "Failed to determine unified cgroup delegation requirements");
+
+       for (char *const *s = list; s && *s; s++) {
+               if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
+                       continue;
+               return sysinfo_ret(false, "The %s file is not writable, skipping unified hierarchy", *s);
+       }
+
+       *ret_files = move_ptr(list);
+       return true;
+}
+
+static bool legacy_hierarchy_delegated(int dfd_base)
  {
-       __do_free char *basecginfo = NULL, *line = NULL;
-       __do_free_string_list char **klist = NULL, **nlist = NULL;
-       __do_fclose FILE *f = NULL;
         int ret;
-       size_t len = 0;
  
-       /* Root spawned containers escape the current cgroup, so use init's
+       /* ENOENT is tolerated; any other access failure means the hierarchy is skipped. */
+       ret = faccessat(dfd_base, ".", W_OK, 0);
+       if (ret >= 0 || errno == ENOENT)
+               return true;
+       return sysinfo_ret(false, "Legacy hierarchy not writable, skipping");
+}
+
+/**
+ * systemd guarantees that the order of co-mounted controllers is stable. On
+ * some systems the order of the controllers might be reversed though.
+ *
+ * For example, this is how the order is mismatched on CentOS 7:
+ *
+ *      [root@localhost ~]# cat /proc/self/cgroup
+ *      11:perf_event:/
+ *      10:pids:/
+ *      9:freezer:/
+ * >>>> 8:cpuacct,cpu:/
+ *      7:memory:/
+ *      6:blkio:/
+ *      5:devices:/
+ *      4:hugetlb:/
+ * >>>> 3:net_prio,net_cls:/
+ *      2:cpuset:/
+ *      1:name=systemd:/user.slice/user-0.slice/session-c1.scope
+ *
+ * whereas the mountpoint:
+ *
+ *      | |-/sys/fs/cgroup                    tmpfs         tmpfs      ro,nosuid,nodev,noexec,mode=755
+ *      | | |-/sys/fs/cgroup/systemd          cgroup        cgroup     rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
+ *      | | |-/sys/fs/cgroup/cpuset           cgroup        cgroup     rw,nosuid,nodev,noexec,relatime,cpuset
+ * >>>> | | |-/sys/fs/cgroup/net_cls,net_prio cgroup        cgroup     rw,nosuid,nodev,noexec,relatime,net_prio,net_cls
+ *      | | |-/sys/fs/cgroup/hugetlb          cgroup        cgroup     rw,nosuid,nodev,noexec,relatime,hugetlb
+ *      | | |-/sys/fs/cgroup/devices          cgroup        cgroup     rw,nosuid,nodev,noexec,relatime,devices
+ *      | | |-/sys/fs/cgroup/blkio            cgroup        cgroup     rw,nosuid,nodev,noexec,relatime,blkio
+ *      | | |-/sys/fs/cgroup/memory           cgroup        cgroup     rw,nosuid,nodev,noexec,relatime,memory
+ * >>>> | | |-/sys/fs/cgroup/cpu,cpuacct      cgroup        cgroup     rw,nosuid,nodev,noexec,relatime,cpuacct,cpu
+ *      | | |-/sys/fs/cgroup/freezer          cgroup        cgroup     rw,nosuid,nodev,noexec,relatime,freezer
+ *      | | |-/sys/fs/cgroup/pids             cgroup        cgroup     rw,nosuid,nodev,noexec,relatime,pids
+ *      | | `-/sys/fs/cgroup/perf_event       cgroup        cgroup     rw,nosuid,nodev,noexec,relatime,perf_event
+ *
+ * Ensure that we always use the systemd-guaranteed stable order when checking
+ * for the mountpoint.
+ */
+__attribute__((returns_nonnull)) __attribute__((nonnull))
+static const char *stable_order(const char *controllers)
+{
+       /* Map the two known reversed co-mounts to the order used by the mountpoint. */
+       if (strequal(controllers, "net_prio,net_cls"))
+               return "net_cls,net_prio";
+       else if (strequal(controllers, "cpuacct,cpu"))
+               return "cpu,cpuacct";
+
+       return unprefix(controllers);
+}
+
+static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
+                               bool unprivileged)
+{
+       __do_free char *cgroup_info = NULL;
+       char *it;
+
+       /*
+        * Root spawned containers escape the current cgroup, so use init's
          * cgroups as our base in that case.
          */
         if (!relative && (geteuid() == 0))
-               basecginfo = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
+               cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
         else
-               basecginfo = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
-       if (!basecginfo)
-               return ret_set_errno(-1, ENOMEM);
-
-       ret = get_existing_subsystems(&klist, &nlist);
-       if (ret < 0)
-               return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
-
-       f = fopen("/proc/self/mountinfo", "re");
-       if (!f)
-               return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
+               cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
+       if (!cgroup_info)
+               return ret_errno(ENOMEM);
  
-       lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
+       lxc_iterate_parts(it, cgroup_info, "\n") {
+               __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
+               __do_free char *controllers = NULL, *current_cgroup = NULL;
+               __do_free_string_list char **controller_list = NULL,
+                                          **delegate = NULL;
+               char *line;
+               int dfd, ret, type;
  
-       while (getline(&line, &len, f) != -1) {
-               __do_free char *base_cgroup = NULL, *mountpoint = NULL;
-               __do_free_string_list char **controller_list = NULL;
-               int type;
-               bool writeable;
+               /* Handle the unified cgroup hierarchy. */
+               line = it;
+               if (unified_cgroup(line)) {
+                       char *unified_mnt;
  
-               type = get_cgroup_version(line);
-               if (type == 0)
-                       continue;
+                       type = UNIFIED_HIERARCHY;
  
-               if (type == CGROUP2_SUPER_MAGIC && ops->unified)
-                       continue;
+                       current_cgroup = current_unified_cgroup(relative, line);
+                       if (IS_ERR(current_cgroup))
+                               return PTR_ERR(current_cgroup);
  
-               if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
-                       if (type == CGROUP2_SUPER_MAGIC)
-                               ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
-                       else if (type == CGROUP_SUPER_MAGIC)
-                               ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
-               } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
-                       if (type == CGROUP_SUPER_MAGIC)
-                               ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
-               } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
-                       if (type == CGROUP2_SUPER_MAGIC)
-                               ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
-               }
-
-               controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
-               if (!controller_list && type == CGROUP_SUPER_MAGIC)
-                       continue;
+                       if (unified_cgroup_fd(ops->dfd_mnt)) {
+                               dfd_mnt = dup_cloexec(ops->dfd_mnt);
+                               unified_mnt = "";
+                       } else {
+                               dfd_mnt = open_at(ops->dfd_mnt,
+                                                 "unified",
+                                                 PROTECT_OPATH_DIRECTORY,
+                                                 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
+                               unified_mnt = "unified";
+                       }
+                       if (dfd_mnt < 0) {
+                               if (errno != ENOENT)
+                                       return syserror("Failed to open %d/unified", ops->dfd_mnt);
  
-               if (type == CGROUP_SUPER_MAGIC)
-                       if (controller_list_is_dup(ops->hierarchies, controller_list)) {
-                               TRACE("Skipping duplicating controller");
+                               SYSTRACE("Unified cgroup not mounted");
                                 continue;
                         }
+                       dfd = dfd_mnt;
+
+                       if (!is_empty_string(current_cgroup)) {
+                               dfd_base = open_at(dfd_mnt, current_cgroup,
+                                                  PROTECT_OPATH_DIRECTORY,
+                                                  PROTECT_LOOKUP_BENEATH_XDEV, 0);
+                               if (dfd_base < 0) {
+                                       if (errno != ENOENT)
+                                               return syserror("Failed to open %d/%s",
+                                                               dfd_mnt, current_cgroup);
+
+                                       SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
+                                                dfd_mnt, current_cgroup);
+                                       continue;
+                               }
+                               dfd = dfd_base;
+                       }
  
-               mountpoint = cg_hybrid_get_mountpoint(line);
-               if (!mountpoint) {
-                       WARN("Failed parsing mountpoint from \"%s\"", line);
-                       continue;
-               }
+                       if (!unified_hierarchy_delegated(dfd, &delegate))
+                               continue;
  
-               if (type == CGROUP_SUPER_MAGIC)
-                       base_cgroup = cg_hybrid_get_current_cgroup(relative, basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
-               else
-                       base_cgroup = cg_hybrid_get_current_cgroup(relative, basecginfo, NULL, CGROUP2_SUPER_MAGIC);
-               if (!base_cgroup) {
-                       WARN("Failed to find current cgroup");
-                       continue;
-               }
+                       controller_list = unified_controllers(dfd, "cgroup.controllers");
+                       if (!controller_list) {
+                               TRACE("No controllers are enabled for delegation in the unified hierarchy");
+                               controller_list = list_new();
+                               if (!controller_list)
+                                       return syserror_set(-ENOMEM, "Failed to create empty controller list");
+                       }
  
-               if (type == CGROUP2_SUPER_MAGIC)
-                       writeable = test_writeable_v2(mountpoint, base_cgroup);
-               else
-                       writeable = test_writeable_v1(mountpoint, base_cgroup);
-               if (!writeable) {
-                       TRACE("The %s group is not writeable", base_cgroup);
-                       continue;
-               }
+                       controllers = strdup(unified_mnt);
+                       if (!controllers)
+                               return ret_errno(ENOMEM);
+               } else {
+                       char *__controllers, *__current_cgroup;
  
-               if (type == CGROUP2_SUPER_MAGIC)
-                       ret = add_hierarchy(ops, NULL, move_ptr(mountpoint), move_ptr(base_cgroup), type);
-               else
-                       ret = add_hierarchy(ops, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
-               if (ret)
-                       return syserrno(ret, "Failed to add cgroup hierarchy");
-               if (ops->unified && unprivileged)
-                       cg_unified_delegate(&(ops->unified)->cgroup2_chown);
-       }
+                       type = LEGACY_HIERARCHY;
  
-       /* verify that all controllers in cgroup.use and all crucial
-        * controllers are accounted for
-        */
-       if (!all_controllers_found(ops))
-               return log_error_errno(-1, ENOENT, "Failed to find all required controllers");
+                       __controllers = strchr(line, ':');
+                       if (!__controllers)
+                               return ret_errno(EINVAL);
+                       __controllers++;
  
-       return 0;
-}
+                       __current_cgroup = strchr(__controllers, ':');
+                       if (!__current_cgroup)
+                               return ret_errno(EINVAL);
+                       *__current_cgroup = '\0';
+                       __current_cgroup++;
  
-/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
-static char *cg_unified_get_current_cgroup(bool relative)
-{
-       __do_free char *basecginfo = NULL, *copy = NULL;
-       char *base_cgroup;
+                       controllers = strdup(stable_order(__controllers));
+                       if (!controllers)
+                               return ret_errno(ENOMEM);
  
-       if (!relative && (geteuid() == 0))
-               basecginfo = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
-       else
-               basecginfo = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
-       if (!basecginfo)
-               return NULL;
+                       dfd_mnt = open_at(ops->dfd_mnt,
+                                         controllers,
+                                         PROTECT_OPATH_DIRECTORY,
+                                         PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
+                       if (dfd_mnt < 0) {
+                               if (errno != ENOENT)
+                                       return syserror("Failed to open %d/%s",
+                                                       ops->dfd_mnt, controllers);
  
-       base_cgroup = strstr(basecginfo, "0::/");
-       if (!base_cgroup)
-               return NULL;
+                               SYSTRACE("%s not mounted", controllers);
+                               continue;
+                       }
+                       dfd = dfd_mnt;
  
-       base_cgroup = base_cgroup + 3;
-       copy = copy_to_eol(base_cgroup);
-       if (!copy)
-               return NULL;
-       trim(copy);
+                       if (!abspath(__current_cgroup))
+                               return ret_errno(EINVAL);
  
-       if (!relative) {
-               base_cgroup = prune_init_scope(copy);
-               if (!base_cgroup)
-                       return NULL;
-       } else {
-               base_cgroup = copy;
-       }
+                       /* remove init.scope */
+                       if (!relative)
+                               __current_cgroup = prune_init_scope(__current_cgroup);
  
-       if (abspath(base_cgroup))
-               base_cgroup = deabs(base_cgroup);
+                       /* create a relative path */
+                       __current_cgroup = deabs(__current_cgroup);
  
-       /* We're allowing base_cgroup to be "". */
-       return strdup(base_cgroup);
-}
+                       current_cgroup = strdup(__current_cgroup);
+                       if (!current_cgroup)
+                               return ret_errno(ENOMEM);
  
-static int cg_unified_init(struct cgroup_ops *ops, bool relative,
-                          bool unprivileged)
-{
-       __do_free char *base_cgroup = NULL;
-       int ret;
+                       if (!is_empty_string(current_cgroup)) {
+                               dfd_base = open_at(dfd_mnt, current_cgroup,
+                                                  PROTECT_OPATH_DIRECTORY,
+                                                  PROTECT_LOOKUP_BENEATH_XDEV, 0);
+                               if (dfd_base < 0) {
+                                       if (errno != ENOENT)
+                                               return syserror("Failed to open %d/%s",
+                                                               dfd_mnt, current_cgroup);
+
+                                       SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
+                                                dfd_mnt, current_cgroup);
+                                       continue;
+                               }
+                               dfd = dfd_base;
+                       }
  
-       base_cgroup = cg_unified_get_current_cgroup(relative);
-       if (!base_cgroup)
-               return ret_errno(EINVAL);
+                       if (!legacy_hierarchy_delegated(dfd))
+                               continue;
  
-       /* TODO: If the user requested specific controllers via lxc.cgroup.use
-        * we should verify here. The reason I'm not doing it right is that I'm
-        * not convinced that lxc.cgroup.use will be the future since it is a
-        * global property. I much rather have an option that lets you request
-        * controllers per container.
-        */
+                       /*
+                        * We intentionally pass __controllers here and not
+                        * controllers because we would otherwise chop the
+                        * mountpoint.
+                        */
+                       controller_list = list_add_controllers(__controllers);
+                       if (!controller_list)
+                               return syserror_set(-ENOMEM, "Failed to create controller list from %s", __controllers);
  
-       ret = add_hierarchy(ops, NULL,
-                           must_copy_string(DEFAULT_CGROUP_MOUNTPOINT),
-                           move_ptr(base_cgroup), CGROUP2_SUPER_MAGIC);
-       if (ret)
-               return syserrno(ret, "Failed to add unified cgroup hierarchy");
+                       if (skip_hierarchy(ops, controller_list))
+                               continue;
  
-       if (unprivileged)
-               cg_unified_delegate(&(ops->unified)->cgroup2_chown);
+                       ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
+               }
  
-       if (bpf_devices_cgroup_supported())
-               ops->unified->bpf_device_controller = 1;
+               ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
+                                          current_cgroup, controller_list, type);
+               if (ret < 0)
+                       return syserror_ret(ret, "Failed to add %s hierarchy", controllers);
+
+               /* Transfer ownership. */
+               move_fd(dfd_mnt);
+               move_fd(dfd_base);
+               move_ptr(current_cgroup);
+               move_ptr(controllers);
+               move_ptr(controller_list);
+               if (type == UNIFIED_HIERARCHY)
+                       ops->unified->delegate = move_ptr(delegate);
+       }
+
+       /* determine cgroup layout */
+       if (ops->unified) {
+               if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
+                       ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
+               } else {
+                       if (bpf_devices_cgroup_supported())
+                               ops->unified->utilities |= DEVICES_CONTROLLER;
+                       ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
+               }
+       }
+
+       if (!controllers_available(ops))
+               return syserror_set(-ENOENT, "One or more requested controllers unavailable or not delegated");
  
-       ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
-       return CGROUP2_SUPER_MAGIC;
+       return 0;
  }
  
-static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf)
+static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
  {
         __do_close int dfd = -EBADF;
-       bool relative = conf->cgroup_meta.relative;
         int ret;
-       const char *tmp;
+       const char *controllers_use;
  
-       if (ops->dfd_mnt_cgroupfs_host >= 0)
-               return ret_errno(EINVAL);
+       if (ops->dfd_mnt >= 0)
+               return ret_errno(EBUSY);
  
         /*
          * I don't see the need for allowing symlinks here. If users want to
@@ -3575,18 +3349,22 @@ static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf)
         dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
                         PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
         if (dfd < 0)
-               return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
+               return syserror("Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
  
-       tmp = lxc_global_config_value("lxc.cgroup.use");
-       if (tmp) {
-               __do_free char *pin = NULL;
-               char *chop, *cur;
+       controllers_use = lxc_global_config_value("lxc.cgroup.use");
+       if (controllers_use) {
+               __do_free char *dup = NULL;
+               char *it;
  
-               pin = must_copy_string(tmp);
-               chop = pin;
+               dup = strdup(controllers_use);
+               if (!dup)
+                       return -errno;
  
-               lxc_iterate_parts(cur, chop, ",")
-                       must_append_string(&ops->cgroup_use, cur);
+               lxc_iterate_parts(it, dup, ",") {
+                       ret = list_add_string(&ops->cgroup_use, it);
+                       if (ret < 0)
+                               return ret;
+               }
         }
  
         /*
@@ -3594,14 +3372,11 @@ static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf)
          * once we know the initialization succeeded. So if we fail we clean up
          * the dfd.
          */
-       ops->dfd_mnt_cgroupfs_host = dfd;
+       ops->dfd_mnt = dfd;
  
-       if (unified_cgroup_fd(dfd))
-               ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
-       else
-               ret = cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
+       ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !lxc_list_empty(&conf->id_map));
         if (ret < 0)
-               return syserrno(ret, "Failed to initialize cgroups");
+               return syserror_ret(ret, "Failed to initialize cgroups");
  
         /* Transfer ownership to cgroup_ops. */
         move_fd(dfd);
@@ -3617,24 +3392,27 @@ __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
  
         /* copy system-wide cgroup information */
         cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
-       if (cgroup_pattern && !strequal(cgroup_pattern, ""))
-               ops->cgroup_pattern = must_copy_string(cgroup_pattern);
+       if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
+               ops->cgroup_pattern = strdup(cgroup_pattern);
+               if (!ops->cgroup_pattern)
+                       return ret_errno(ENOMEM);
+       }
  
         return 0;
  }
  
-struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
+struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
  {
-       __do_free struct cgroup_ops *cgfsng_ops = NULL;
+       __cleanup_cgroup_ops struct cgroup_ops *cgfsng_ops = NULL;
  
         cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
         if (!cgfsng_ops)
                 return ret_set_errno(NULL, ENOMEM);
  
-       cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
-       cgfsng_ops->dfd_mnt_cgroupfs_host = -EBADF;
+       cgfsng_ops->cgroup_layout       = CGROUP_LAYOUT_UNKNOWN;
+       cgfsng_ops->dfd_mnt             = -EBADF;
  
-       if (__cgroup_init(cgfsng_ops, conf))
+       if (initialize_cgroups(cgfsng_ops, conf))
                 return NULL;
  
         cgfsng_ops->data_init                           = cgfsng_data_init;
@@ -3646,7 +3424,7 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
         cgfsng_ops->payload_delegate_controllers        = cgfsng_payload_delegate_controllers;
         cgfsng_ops->payload_create                      = cgfsng_payload_create;
         cgfsng_ops->payload_enter                       = cgfsng_payload_enter;
-       cgfsng_ops->payload_finalize                    = cgfsng_payload_finalize;
+       cgfsng_ops->finalize                            = cgfsng_finalize;
         cgfsng_ops->get_cgroup                          = cgfsng_get_cgroup;
         cgfsng_ops->get                                 = cgfsng_get;
         cgfsng_ops->set                                 = cgfsng_set;
@@ -3660,7 +3438,7 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
         cgfsng_ops->chown                               = cgfsng_chown;
         cgfsng_ops->mount                               = cgfsng_mount;
         cgfsng_ops->devices_activate                    = cgfsng_devices_activate;
-       cgfsng_ops->get_limiting_cgroup                 = cgfsng_get_limiting_cgroup;
+       cgfsng_ops->get_limit_cgroup                    = cgfsng_get_limit_cgroup;
  
         cgfsng_ops->criu_escape                         = cgfsng_criu_escape;
         cgfsng_ops->criu_num_hierarchies                = cgfsng_criu_num_hierarchies;
@@ -3669,23 +3447,14 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
         return move_ptr(cgfsng_ops);
  }
  
-int cgroup_attach(const struct lxc_conf *conf, const char *name,
-                 const char *lxcpath, pid_t pid)
+static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid)
  {
-       __do_close int unified_fd = -EBADF;
         int ret;
  
-       if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
-               return ret_errno(EINVAL);
-
-       unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
-       if (unified_fd < 0)
-               return ret_errno(ENOCGROUP2);
-
         if (!lxc_list_empty(&conf->id_map)) {
                 struct userns_exec_unified_attach_data args = {
                         .conf           = conf,
-                       .unified_fd     = unified_fd,
+                       .unified_fd     = fd_unified,
                         .pid            = pid,
                 };
  
@@ -3699,62 +3468,182 @@ int cgroup_attach(const struct lxc_conf *conf, const char *name,
                                           cgroup_unified_attach_child_wrapper,
                                           &args);
         } else {
-               ret = cgroup_attach_leaf(conf, unified_fd, pid);
+               ret = cgroup_attach_leaf(conf, fd_unified, pid);
+       }
+
+       return ret;
+}
+
+static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name,
+                               const char *lxcpath, pid_t pid)
+{ /* Attach @pid via each fd in the ctx filled by lxc_cmd_get_cgroup_ctx(). */
+       call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){};
+       int ret;
+       size_t idx;
+       ssize_t pidstr_len;
+       char pidstr[INTTYPE_TO_STRLEN(pid_t)];
+
+       ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, sizeof(struct cgroup_ctx), ctx);
+       if (ret < 0)
+               return ret_errno(ENOSYS); /* every failure here is collapsed to ENOSYS */
+
+       pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
+       if (pidstr_len < 0)
+               return pidstr_len;
+
+       for (idx = 0; idx < ctx->fd_len; idx++) {
+               int dfd_con = ctx->fd[idx];
+
+               if (unified_cgroup_fd(dfd_con))
+                       ret = __unified_attach_fd(conf, dfd_con, pid);
+               else /* unified_cgroup_fd() was false: write the pid string to cgroup.procs */
+                       ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len);
+               if (ret) /* the first failing fd ends the loop */
+                       return syserror_ret(ret, "Failed to attach to cgroup fd %d", dfd_con);
+               else
+                       TRACE("Attached to cgroup fd %d", dfd_con);
+       }
+
+       if (idx == 0) /* only reachable when ctx->fd_len is 0 */
+               return syserror_set(-ENOENT, "Failed to attach to cgroups");
+
+       TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->layout));
+       return 0;
+}
+
+static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name,
+                                  const char *lxcpath, pid_t pid)
+{ /* Attach @pid using the single fd returned by lxc_cmd_get_cgroup2_fd(). */
+       __do_close int dfd_unified = -EBADF; /* NOTE(review): presumably auto-closed on return — confirm */
+
+       if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
+               return ret_errno(EINVAL);
+
+       dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath);
+       if (dfd_unified < 0)
+               return ret_errno(ENOSYS); /* any failure to obtain the fd is reported as ENOSYS */
+
+       return __unified_attach_fd(conf, dfd_unified, pid);
+}
+
+/* Attach @pid to the container's cgroups; rejects bad arguments before either attach path runs. */
+int cgroup_attach(const struct lxc_conf *conf, const char *name, const char *lxcpath, pid_t pid)
+{
+       int ret;
+
+       if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
+               return ret_errno(EINVAL);
+
+       ret = __cgroup_attach_many(conf, name, lxcpath, pid);
+       if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret)) { /* any other error is returned as-is below */
+               ret = __cgroup_attach_unified(conf, name, lxcpath, pid);
+               if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret))
+                       return ret_errno(ENOSYS);
         }
  
         return ret;
  }
  
  /* Connects to command socket therefore isn't callable from command handler. */
-int cgroup_get(const char *name, const char *lxcpath,
-              const char *filename, char *buf, size_t len)
+int cgroup_get(const char *name, const char *lxcpath, const char *key, char *buf, size_t len)
  {
-       __do_close int unified_fd = -EBADF;
-       ssize_t ret;
+       __do_close int dfd = -EBADF;
+       struct cgroup_fd fd = {
+               .fd = -EBADF,
+       };
+       size_t len_controller;
+       int ret;
  
-       if (is_empty_string(filename) || is_empty_string(name) ||
-           is_empty_string(lxcpath))
+       if (is_empty_string(name) || is_empty_string(lxcpath) ||
+           is_empty_string(key))
                 return ret_errno(EINVAL);
  
         if ((buf && !len) || (len && !buf))
                 return ret_errno(EINVAL);
  
-       unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
-       if (unified_fd < 0)
-               return ret_errno(ENOCGROUP2);
+       len_controller = strcspn(key, "."); /* length of the part of @key before the first '.' */
+       len_controller++; /* Don't forget the \0 byte. */
+       if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
+               return ret_errno(EINVAL);
+       (void)strlcpy(fd.controller, key, len_controller); /* copies only that leading part of @key */
  
-       ret = lxc_read_try_buf_at(unified_fd, filename, buf, len);
-       if (ret < 0)
-               SYSERROR("Failed to read cgroup value");
+       ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
+       if (ret < 0) {
+               if (!ERRNO_IS_NOT_SUPPORTED(ret)) /* only a "not supported" error reaches the fallback */
+                       return ret;
+
+               dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
+               if (dfd < 0) {
+                       if (!ERRNO_IS_NOT_SUPPORTED(ret)) /* NOTE(review): re-tests ret, not dfd; looks unreachable — confirm */
+                               return ret;
+
+                       return ret_errno(ENOSYS);
+               }
+               fd.type = UNIFIED_HIERARCHY; /* the fallback fd is treated as a unified-hierarchy fd */
+               fd.fd = move_fd(dfd);
+       }
+       dfd = move_fd(fd.fd); /* NOTE(review): dfd is __do_close, presumably closed on return — confirm */
+
+       TRACE("Reading %s from %s cgroup hierarchy", key, cgroup_hierarchy_name(fd.type));
+
+       if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices"))
+               return ret_errno(EOPNOTSUPP); /* "devices" keys are refused on the unified hierarchy */
+       else
+               ret = lxc_read_try_buf_at(dfd, key, buf, len);
  
         return ret;
  }
  
  /* Connects to command socket therefore isn't callable from command handler. */
-int cgroup_set(const char *name, const char *lxcpath,
-              const char *filename, const char *value)
+int cgroup_set(const char *name, const char *lxcpath, const char *key, const char *value)
  {
-       __do_close int unified_fd = -EBADF;
-       ssize_t ret;
+       __do_close int dfd = -EBADF;
+       struct cgroup_fd fd = {
+               .fd = -EBADF,
+       };
+       size_t len_controller;
+       int ret;
  
-       if (is_empty_string(filename) || is_empty_string(value) ||
-           is_empty_string(name) || is_empty_string(lxcpath))
+       if (is_empty_string(name) || is_empty_string(lxcpath) ||
+           is_empty_string(key) || is_empty_string(value))
                 return ret_errno(EINVAL);
  
-       unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
-       if (unified_fd < 0)
-               return ret_errno(ENOCGROUP2);
+       len_controller = strcspn(key, ".");
+       len_controller++; /* Don't forget the \0 byte. */
+       if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
+               return ret_errno(EINVAL);
+       (void)strlcpy(fd.controller, key, len_controller);
+
+       ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
+       if (ret < 0) {
+               if (!ERRNO_IS_NOT_SUPPORTED(ret))
+                       return ret;
+
+               dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
+               if (dfd < 0) {
+                       if (!ERRNO_IS_NOT_SUPPORTED(ret))
+                               return ret;
  
-       if (strnequal(filename, "devices.", STRLITERALLEN("devices."))) {
+                       return ret_errno(ENOSYS);
+               }
+               fd.type = UNIFIED_HIERARCHY;
+               fd.fd = move_fd(dfd);
+       }
+       dfd = move_fd(fd.fd);
+
+       TRACE("Setting %s to %s in %s cgroup hierarchy", key, value, cgroup_hierarchy_name(fd.type));
+
+       if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) {
                 struct device_item device = {};
  
-               ret = device_cgroup_rule_parse(&device, filename, value);
+               ret = device_cgroup_rule_parse(&device, key, value);
                 if (ret < 0)
-                       return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", filename, value);
+                       return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
+                                              key, value);
  
                 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
         } else {
-               ret = lxc_writeat(unified_fd, filename, value, strlen(value));
+               ret = lxc_writeat(dfd, key, value, strlen(value));
         }
  
         return ret;
@@ -3768,9 +3657,9 @@ static int do_cgroup_freeze(int unified_fd,
                             const char *wait_error)
  {
         __do_close int events_fd = -EBADF;
-       call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
+       call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL;
         int ret;
-       struct lxc_epoll_descr descr = {};
+       struct lxc_async_descr descr = {};
  
         if (timeout != 0) {
                 ret = lxc_mainloop_open(&descr);
@@ -3784,7 +3673,11 @@ static int do_cgroup_freeze(int unified_fd,
                 if (events_fd < 0)
                         return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
  
-               ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
+               ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI,
+                                                     freezer_cgroup_events_cb,
+                                                     default_cleanup_handler,
+                                                     INT_TO_PTR(state_num),
+                                                     "freezer_cgroup_events_cb");
                 if (ret < 0)
                         return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
         }
@@ -3817,7 +3710,7 @@ int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
         if (is_empty_string(name) || is_empty_string(lxcpath))
                 return ret_errno(EINVAL);
  
-       unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
+       unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
         if (unified_fd < 0)
                 return ret_errno(ENOCGROUP2);
  
@@ -3842,7 +3735,7 @@ int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
         if (is_empty_string(name) || is_empty_string(lxcpath))
                 return ret_errno(EINVAL);
  
-       unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
+       unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
         if (unified_fd < 0)
                 return ret_errno(ENOCGROUP2);